[libvirt] [PATCH 0/7] qemu: PCI bridge support

This series allows PCI bridges to be added and used.

After patch 6/7, more buses are usable if the bridges and addresses are
explicitly specified in the XML. After 7/7, bridges are auto-added and the
new buses are used automatically. (This only works if there is enough space
on bus 0 for the bridges; otherwise they need to be specified manually.)

(Documentation, tests and bridge hotplug are still missing.)

Contains slightly modified versions of li guang's patches 1-3:
https://www.redhat.com/archives/libvir-list/2013-February/msg00981.html
and reworked address allocation for multiple buses:
https://www.redhat.com/archives/libvir-list/2013-February/msg00793.html

Ján Tomko (5):
  qemu: QEMU_PCI constant consistency
  qemu: move PCI address check out of qemuPCIAddressAsString
  qemu: switch PCI address set from hash table to an array
  qemu: Add support for plugging devices into PCI bridges
  qemu: auto-add and use bridges

liguang (2):
  add pci-bridge controller type
  qemu: build command line for pci-bridge device

 docs/schemas/domaincommon.rng |   1 +
 src/conf/domain_conf.c        |   3 +-
 src/conf/domain_conf.h        |   1 +
 src/qemu/qemu_capabilities.c  |   2 +
 src/qemu/qemu_capabilities.h  |   1 +
 src/qemu/qemu_command.c       | 551 ++++++++++++++++++++++++++++++------------
 src/qemu/qemu_command.h       |   4 +-
 tests/qemuhelptest.c          |  21 +-
 8 files changed, 421 insertions(+), 163 deletions(-)

-- 
1.8.1.5

Change QEMU_PCI_ADDRESS_LAST_SLOT to the number of slots in the bus,
not the maximum slot value, to match QEMU_PCI_ADDRESS_LAST_FUNCTION.
---
 src/qemu/qemu_command.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 693d30d..8321dcd 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -1185,7 +1185,7 @@ cleanup:
     return ret;
 }
 
-#define QEMU_PCI_ADDRESS_LAST_SLOT 31
+#define QEMU_PCI_ADDRESS_LAST_SLOT 32
 #define QEMU_PCI_ADDRESS_LAST_FUNCTION 8
 struct _qemuDomainPCIAddressSet {
     virHashTablePtr used;
@@ -1536,8 +1536,8 @@ qemuDomainPCIAddressGetNextSlot(qemuDomainPCIAddressSetPtr addrs,
     char *addr;
 
     tmp_addr.slot++;
-    for (i = 0; i <= QEMU_PCI_ADDRESS_LAST_SLOT; i++, tmp_addr.slot++) {
-        if (QEMU_PCI_ADDRESS_LAST_SLOT < tmp_addr.slot) {
+    for (i = 0; i < QEMU_PCI_ADDRESS_LAST_SLOT; i++, tmp_addr.slot++) {
+        if (QEMU_PCI_ADDRESS_LAST_SLOT <= tmp_addr.slot) {
             tmp_addr.slot = 0;
         }
-- 
1.8.1.5

On 04/03/2013 11:50 AM, Ján Tomko wrote:
Change QEMU_PCI_ADDRESS_LAST_SLOT to the number of slots in the bus, not the maximum slot value, to match QEMU_PCI_ADDRESS_LAST_FUNCTION. ---
If you want to be *really* consistent, you should rename these to VIR_QEMU_PCI_ADDRESS_SLOT_LAST and VIR_QEMU_PCI_ADDRESS_FUNCTION_LAST :-) (i.e. start with "VIR_" and put "LAST" at the end)
 src/qemu/qemu_command.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 693d30d..8321dcd 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -1185,7 +1185,7 @@ cleanup:
     return ret;
 }
 
-#define QEMU_PCI_ADDRESS_LAST_SLOT 31
+#define QEMU_PCI_ADDRESS_LAST_SLOT 32
 #define QEMU_PCI_ADDRESS_LAST_FUNCTION 8
 struct _qemuDomainPCIAddressSet {
     virHashTablePtr used;
@@ -1536,8 +1536,8 @@ qemuDomainPCIAddressGetNextSlot(qemuDomainPCIAddressSetPtr addrs,
     char *addr;
 
     tmp_addr.slot++;
-    for (i = 0; i <= QEMU_PCI_ADDRESS_LAST_SLOT; i++, tmp_addr.slot++) {
-        if (QEMU_PCI_ADDRESS_LAST_SLOT < tmp_addr.slot) {
+    for (i = 0; i < QEMU_PCI_ADDRESS_LAST_SLOT; i++, tmp_addr.slot++) {
+        if (QEMU_PCI_ADDRESS_LAST_SLOT <= tmp_addr.slot) {
             tmp_addr.slot = 0;
         }

On 04/04/2013 11:25 AM, Laine Stump wrote:
On 04/03/2013 11:50 AM, Ján Tomko wrote:
Change QEMU_PCI_ADDRESS_LAST_SLOT to the number of slots in the bus, not the maximum slot value, to match QEMU_PCI_ADDRESS_LAST_FUNCTION. ---
If you want to be *really* consistent, you should rename these to VIR_QEMU_PCI_ADDRESS_SLOT_LAST and VIR_QEMU_PCI_ADDRESS_FUNCTION_LAST :-)
(i.e. start with "VIR_" and put "LAST" at the end)
A VIR_ prefix might not be necessary, since this define is local to a qemu_ file; but yes, the bit about _LAST being a suffix in our various enum tail markers is a valid point.

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org

Move bus and domain checks from qemuPCIAddressAsString to a separate function and add a check for function and slot so that we can switch from a hash table to an array. Remove redundant checks in qemuBuildDeviceAddressStr. --- src/qemu/qemu_command.c | 111 +++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 43 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 8321dcd..a16d5f1 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -1193,17 +1193,43 @@ struct _qemuDomainPCIAddressSet { }; +/* Check the PCI address + * Returns -1 if the address is unusable + * 0 if it's OK. + */ +static int qemuPCIAddressCheck(qemuDomainPCIAddressSetPtr addrs ATTRIBUTE_UNUSED, + virDevicePCIAddressPtr addr) +{ + if (addr->domain != 0) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Only PCI domain 0 is available")); + return -1; + } + if (addr->bus != 0) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Only PCI bus 0 is available")); + return -1; + } + if (addr->function >= QEMU_PCI_ADDRESS_LAST_FUNCTION) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid PCI address: function must be < %u"), + QEMU_PCI_ADDRESS_LAST_FUNCTION); + return -1; + } + if (addr->slot >= QEMU_PCI_ADDRESS_LAST_SLOT) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid PCI address: slot must be < %u"), + QEMU_PCI_ADDRESS_LAST_SLOT); + return -1; + } + return 0; +} + + static char *qemuPCIAddressAsString(virDevicePCIAddressPtr addr) { char *str; - if (addr->domain != 0 || - addr->bus != 0) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Only PCI domain 0 and bus 0 are available")); - return NULL; - } - if (virAsprintf(&str, "%d:%d:%d.%d", addr->domain, addr->bus, @@ -1222,7 +1248,8 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, void *opaque) { int ret = -1; - char *addr = NULL; + char *str = NULL; + virDevicePCIAddressPtr addr = &info->addr.pci; qemuDomainPCIAddressSetPtr addrs = opaque; if ((info->type != VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI) @@ -1235,57 +1262,60 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, return 0; } - addr = qemuPCIAddressAsString(&info->addr.pci); - if (!addr) + if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + + str = qemuPCIAddressAsString(addr); + if (!str) goto cleanup; - if (virHashLookup(addrs->used, addr)) { + if (virHashLookup(addrs->used, str)) { if (info->addr.pci.function != 0) { virReportError(VIR_ERR_XML_ERROR, _("Attempted double use of PCI Address '%s' " "(may need \"multifunction='on'\" for device on function 0)"), - addr); + str); } else { virReportError(VIR_ERR_XML_ERROR, - _("Attempted double use of PCI Address '%s'"), addr); + _("Attempted double use of PCI Address '%s'"), str); } goto cleanup; } - VIR_DEBUG("Remembering PCI addr %s", addr); - if (virHashAddEntry(addrs->used, addr, addr) < 0) + VIR_DEBUG("Remembering PCI addr %s", str); + if (virHashAddEntry(addrs->used, str, str) < 0) goto cleanup; - addr = NULL; + str = NULL; if ((info->addr.pci.function == 0) && (info->addr.pci.multi != VIR_DEVICE_ADDRESS_PCI_MULTI_ON)) { /* a function 0 w/o multifunction=on must reserve the entire slot */ - virDevicePCIAddress tmp_addr = info->addr.pci; + virDevicePCIAddress tmp_addr = *addr; unsigned int *func = &tmp_addr.function; for (*func = 1; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - addr = qemuPCIAddressAsString(&tmp_addr); - if (!addr) + str = qemuPCIAddressAsString(&tmp_addr); + if (!str) goto cleanup; - if (virHashLookup(addrs->used, addr)) { + 
if (virHashLookup(addrs->used, str)) { virReportError(VIR_ERR_XML_ERROR, _("Attempted double use of PCI Address '%s' " "(need \"multifunction='off'\" for device " "on function 0)"), - addr); + str); goto cleanup; } - VIR_DEBUG("Remembering PCI addr %s (multifunction=off for function 0)", addr); - if (virHashAddEntry(addrs->used, addr, addr)) + VIR_DEBUG("Remembering PCI addr %s (multifunction=off for function 0)", str); + if (virHashAddEntry(addrs->used, str, str)) goto cleanup; - addr = NULL; + str = NULL; } } ret = 0; cleanup: - VIR_FREE(addr); + VIR_FREE(str); return ret; } @@ -1385,6 +1415,9 @@ static int qemuDomainPCIAddressCheckSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddress tmp_addr = *addr; unsigned int *func = &(tmp_addr.function); + if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { str = qemuPCIAddressAsString(&tmp_addr); if (!str) @@ -1406,6 +1439,9 @@ int qemuDomainPCIAddressReserveAddr(qemuDomainPCIAddressSetPtr addrs, { char *str; + if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + str = qemuPCIAddressAsString(addr); if (!str) return -1; @@ -1479,6 +1515,9 @@ int qemuDomainPCIAddressReleaseAddr(qemuDomainPCIAddressSetPtr addrs, char *str; int ret; + if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + str = qemuPCIAddressAsString(addr); if (!str) return -1; @@ -1498,6 +1537,9 @@ int qemuDomainPCIAddressReleaseSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddress tmp_addr = *addr; unsigned int *func = &tmp_addr.function; + if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { str = qemuPCIAddressAsString(&tmp_addr); if (!str) @@ -1965,24 +2007,7 @@ qemuBuildDeviceAddressStr(virBufferPtr buf, virQEMUCapsPtr qemuCaps) { if (info->type == VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI) { - if (info->addr.pci.domain != 0) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Only PCI device addresses with domain=0 are supported")); - return -1; - } - if (info->addr.pci.bus != 0) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Only PCI device addresses with bus=0 are supported")); - return -1; - } - if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_MULTIFUNCTION)) { - if (info->addr.pci.function > 7) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("The function of PCI device addresses must " - "be less than 8")); - return -1; - } - } else { + if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_MULTIFUNCTION)) { if (info->addr.pci.function != 0) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("Only PCI device addresses with function=0 " -- 1.8.1.5

On 04/03/2013 11:50 AM, Ján Tomko wrote:
Move bus and domain checks from qemuPCIAddressAsString to a separate function and add a check for function and slot so that we can switch from a hash table to an array.
Remove redundant checks in qemuBuildDeviceAddressStr. --- src/qemu/qemu_command.c | 111 +++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 43 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 8321dcd..a16d5f1 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -1193,17 +1193,43 @@ struct _qemuDomainPCIAddressSet { };
+/* Check the PCI address + * Returns -1 if the address is unusable + * 0 if it's OK. + */ +static int qemuPCIAddressCheck(qemuDomainPCIAddressSetPtr addrs ATTRIBUTE_UNUSED, + virDevicePCIAddressPtr addr)
How about naming this qemuPCIAddressValidate()? (This is especially good since the verb "Check" is used elsewhere in this file to mean "check to see if this is *in use*")
+{ + if (addr->domain != 0) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Only PCI domain 0 is available")); + return -1; + } + if (addr->bus != 0) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Only PCI bus 0 is available")); + return -1; + } + if (addr->function >= QEMU_PCI_ADDRESS_LAST_FUNCTION) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid PCI address: function must be < %u"), + QEMU_PCI_ADDRESS_LAST_FUNCTION); + return -1; + } + if (addr->slot >= QEMU_PCI_ADDRESS_LAST_SLOT) { + virReportError(VIR_ERR_XML_ERROR, + _("Invalid PCI address: slot must be < %u"), + QEMU_PCI_ADDRESS_LAST_SLOT); + return -1; + } + return 0; +} + + static char *qemuPCIAddressAsString(virDevicePCIAddressPtr addr) { char *str;
- if (addr->domain != 0 || - addr->bus != 0) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Only PCI domain 0 and bus 0 are available")); - return NULL; - } -
Yes, definitely by the time we are converting this to a string it should have already been validated.
if (virAsprintf(&str, "%d:%d:%d.%d", addr->domain, addr->bus, @@ -1222,7 +1248,8 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, void *opaque) { int ret = -1; - char *addr = NULL; + char *str = NULL; + virDevicePCIAddressPtr addr = &info->addr.pci; qemuDomainPCIAddressSetPtr addrs = opaque;
if ((info->type != VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI) @@ -1235,57 +1262,60 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, return 0; }
- addr = qemuPCIAddressAsString(&info->addr.pci); - if (!addr) + if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + + str = qemuPCIAddressAsString(addr); + if (!str) goto cleanup;
I prefer putting the assignment into the if condition: if (!(str = qemuPCIAddressAsString(addr))) goto cleanup;
- if (virHashLookup(addrs->used, addr)) { + if (virHashLookup(addrs->used, str)) { if (info->addr.pci.function != 0) { virReportError(VIR_ERR_XML_ERROR, _("Attempted double use of PCI Address '%s' " "(may need \"multifunction='on'\" for device on function 0)"), - addr); + str); } else { virReportError(VIR_ERR_XML_ERROR, - _("Attempted double use of PCI Address '%s'"), addr); + _("Attempted double use of PCI Address '%s'"), str); } goto cleanup; }
- VIR_DEBUG("Remembering PCI addr %s", addr); - if (virHashAddEntry(addrs->used, addr, addr) < 0) + VIR_DEBUG("Remembering PCI addr %s", str); + if (virHashAddEntry(addrs->used, str, str) < 0) goto cleanup; - addr = NULL; + str = NULL;
if ((info->addr.pci.function == 0) && (info->addr.pci.multi != VIR_DEVICE_ADDRESS_PCI_MULTI_ON)) { /* a function 0 w/o multifunction=on must reserve the entire slot */ - virDevicePCIAddress tmp_addr = info->addr.pci; + virDevicePCIAddress tmp_addr = *addr; unsigned int *func = &tmp_addr.function;
for (*func = 1; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - addr = qemuPCIAddressAsString(&tmp_addr); - if (!addr) + str = qemuPCIAddressAsString(&tmp_addr); + if (!str) goto cleanup;
Again, as long as you're modifying the lines, might as well stuff the assignment into the if condition.
- if (virHashLookup(addrs->used, addr)) { + if (virHashLookup(addrs->used, str)) { virReportError(VIR_ERR_XML_ERROR, _("Attempted double use of PCI Address '%s' " "(need \"multifunction='off'\" for device " "on function 0)"), - addr); + str); goto cleanup; }
- VIR_DEBUG("Remembering PCI addr %s (multifunction=off for function 0)", addr); - if (virHashAddEntry(addrs->used, addr, addr)) + VIR_DEBUG("Remembering PCI addr %s (multifunction=off for function 0)", str); + if (virHashAddEntry(addrs->used, str, str)) goto cleanup; - addr = NULL; + str = NULL; } } ret = 0; cleanup: - VIR_FREE(addr); + VIR_FREE(str); return ret; }
@@ -1385,6 +1415,9 @@ static int qemuDomainPCIAddressCheckSlot(qemuDomainPCIAddressSetPtr addrs,
I just noticed that the (existing) comment for this function isn't worded very well. As long as you're modifying things, could you fix that too? (just s/the other/another/g) Hmm, and now that I've suggested changing the name of qemuPCIAddressCheck because of this function using the word "Check" differently, I'm thinking *this* function could be better named as well. How about qemuDomainPCIAddressInUse()? Also, I think it should return true or false, not 0 or -1 (with associated adjustments in callers).
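A rough sketch of what that suggested bool-returning variant could look like against the hash-table representation that exists at this point in the series (the name follows the suggestion above; this is illustration only, not code from the patches, and real callers would still need to decide how to treat allocation failure):

    static bool
    qemuDomainPCIAddressInUse(qemuDomainPCIAddressSetPtr addrs,
                              virDevicePCIAddressPtr addr)
    {
        virDevicePCIAddress tmp_addr = *addr;
        bool used = false;
        size_t i;

        /* the slot counts as "in use" if any of its functions is reserved */
        for (i = 0; i < QEMU_PCI_ADDRESS_LAST_FUNCTION && !used; i++) {
            char *str;

            tmp_addr.function = i;
            if (!(str = qemuPCIAddressAsString(&tmp_addr)))
                break;   /* allocation failure treated as "not found" in this sketch */
            used = virHashLookup(addrs->used, str) != NULL;
            VIR_FREE(str);
        }
        return used;
    }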
virDevicePCIAddress tmp_addr = *addr; unsigned int *func = &(tmp_addr.function);
+ if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; +
And as a matter of fact, I think you shouldn't be validating the PCI address here - in two of the 3 callers, a fixed hard-coded pci address is constructed (so you know that it's always valid), and in the 3rd caller, it's being done inside a loop whose index self-limits the PCI address to a valid range. (This is good, because if you left the call to the validation in here, you would have to have a tri-state return value to allow for failure as well as inuse/free).
for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { str = qemuPCIAddressAsString(&tmp_addr); if (!str) @@ -1406,6 +1439,9 @@ int qemuDomainPCIAddressReserveAddr(qemuDomainPCIAddressSetPtr addrs, { char *str;
+ if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + str = qemuPCIAddressAsString(addr); if (!str) return -1; @@ -1479,6 +1515,9 @@ int qemuDomainPCIAddressReleaseAddr(qemuDomainPCIAddressSetPtr addrs, char *str; int ret;
+ if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + str = qemuPCIAddressAsString(addr); if (!str) return -1; @@ -1498,6 +1537,9 @@ int qemuDomainPCIAddressReleaseSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddress tmp_addr = *addr; unsigned int *func = &tmp_addr.function;
+ if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { str = qemuPCIAddressAsString(&tmp_addr); if (!str) @@ -1965,24 +2007,7 @@ qemuBuildDeviceAddressStr(virBufferPtr buf, virQEMUCapsPtr qemuCaps) { if (info->type == VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI) { - if (info->addr.pci.domain != 0) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Only PCI device addresses with domain=0 are supported")); - return -1; - } - if (info->addr.pci.bus != 0) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("Only PCI device addresses with bus=0 are supported")); - return -1; - } - if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_MULTIFUNCTION)) { - if (info->addr.pci.function > 7) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("The function of PCI device addresses must " - "be less than 8")); - return -1; - } - } else { + if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_MULTIFUNCTION)) { if (info->addr.pci.function != 0) { virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", _("Only PCI device addresses with function=0 "
Looks fine aside from the nits I listed above.

Each bus (just one so far) is represented by an array with 32 slots where each slot is stored as an 8-bit integer where each bit represents a function. This makes operations with whole slots easier. --- src/qemu/qemu_command.c | 152 +++++++++++++++--------------------------------- 1 file changed, 48 insertions(+), 104 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index a16d5f1..e221c82 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -1187,8 +1187,14 @@ cleanup: #define QEMU_PCI_ADDRESS_LAST_SLOT 32 #define QEMU_PCI_ADDRESS_LAST_FUNCTION 8 + +/* + * Each bit represents a function + * Each byte represents a slot + */ +typedef uint8_t _qemuDomainPCIAddressBus[QEMU_PCI_ADDRESS_LAST_SLOT]; struct _qemuDomainPCIAddressSet { - virHashTablePtr used; + _qemuDomainPCIAddressBus *used; virDevicePCIAddress lastaddr; }; @@ -1269,7 +1275,7 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, if (!str) goto cleanup; - if (virHashLookup(addrs->used, str)) { + if (qemuDomainPCIAddressReserveAddr(addrs, addr) < 0) { if (info->addr.pci.function != 0) { virReportError(VIR_ERR_XML_ERROR, _("Attempted double use of PCI Address '%s' " @@ -1282,36 +1288,21 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, goto cleanup; } - VIR_DEBUG("Remembering PCI addr %s", str); - if (virHashAddEntry(addrs->used, str, str) < 0) - goto cleanup; - str = NULL; - if ((info->addr.pci.function == 0) && (info->addr.pci.multi != VIR_DEVICE_ADDRESS_PCI_MULTI_ON)) { /* a function 0 w/o multifunction=on must reserve the entire slot */ - virDevicePCIAddress tmp_addr = *addr; - unsigned int *func = &tmp_addr.function; - - for (*func = 1; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - str = qemuPCIAddressAsString(&tmp_addr); - if (!str) - goto cleanup; - - if (virHashLookup(addrs->used, str)) { - virReportError(VIR_ERR_XML_ERROR, - _("Attempted double use of PCI Address '%s' " - "(need \"multifunction='off'\" for device " - "on function 0)"), - str); - goto cleanup; - } - - VIR_DEBUG("Remembering PCI addr %s (multifunction=off for function 0)", str); - if (virHashAddEntry(addrs->used, str, str)) - goto cleanup; - str = NULL; + ignore_value(qemuDomainPCIAddressReleaseAddr(addrs, addr)); + if (qemuDomainPCIAddressReserveSlot(addrs, addr) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Attempted double use of PCI Address '%s' " + "(need \"multifunction='off'\" for device " + "on function 0)"), + str); + goto cleanup; } + VIR_DEBUG("Remembering PCI slot: %s (multifunction=off)", str); + } else { + VIR_DEBUG("Remembering PCI addr: %s", str); } ret = 0; cleanup: @@ -1375,13 +1366,6 @@ int qemuDomainAssignAddresses(virDomainDefPtr def, return qemuDomainAssignPCIAddresses(def, qemuCaps, obj); } -static void -qemuDomainPCIAddressSetFreeEntry(void *payload, - const void *name ATTRIBUTE_UNUSED) -{ - VIR_FREE(payload); -} - qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def) { qemuDomainPCIAddressSetPtr addrs; @@ -1389,8 +1373,8 @@ qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def) if (VIR_ALLOC(addrs) < 0) goto no_memory; - if (!(addrs->used = virHashCreate(10, qemuDomainPCIAddressSetFreeEntry))) - goto error; + if (VIR_ALLOC_N(addrs->used, 1) < 0) + goto no_memory; if (virDomainDeviceInfoIterate(def, qemuCollectPCIAddress, addrs) < 0) goto error; @@ -1411,25 +1395,11 @@ error: static int qemuDomainPCIAddressCheckSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr) { - char *str; - 
virDevicePCIAddress tmp_addr = *addr; - unsigned int *func = &(tmp_addr.function); - if (qemuPCIAddressCheck(addrs, addr) < 0) return -1; - for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - str = qemuPCIAddressAsString(&tmp_addr); - if (!str) - return -1; - - if (virHashLookup(addrs->used, str)) { - VIR_FREE(str); - return -1; - } - - VIR_FREE(str); - } + if (addrs->used[addr->bus][addr->slot]) + return -1; return 0; } @@ -1448,42 +1418,46 @@ int qemuDomainPCIAddressReserveAddr(qemuDomainPCIAddressSetPtr addrs, VIR_DEBUG("Reserving PCI addr %s", str); - if (virHashLookup(addrs->used, str)) { + if (addrs->used[addr->bus][addr->slot] & 1 << addr->function) { virReportError(VIR_ERR_INTERNAL_ERROR, _("unable to reserve PCI address %s"), str); VIR_FREE(str); return -1; } - if (virHashAddEntry(addrs->used, str, str)) { - VIR_FREE(str); - return -1; - } + VIR_FREE(str); addrs->lastaddr = *addr; addrs->lastaddr.function = 0; addrs->lastaddr.multi = 0; + addrs->used[addr->bus][addr->slot] |= 1 << addr->function; return 0; } int qemuDomainPCIAddressReserveSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr) { - virDevicePCIAddress tmp_addr = *addr; - unsigned int *func = &tmp_addr.function; - unsigned int last; + char *str; - for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - if (qemuDomainPCIAddressReserveAddr(addrs, &tmp_addr) < 0) - goto cleanup; + if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + + str = qemuPCIAddressAsString(addr); + if (!str) + return -1; + + VIR_DEBUG("Reserving PCI slot %s", str); + + if (addrs->used[addr->bus][addr->slot]) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to reserve PCI slot %s"), str); + VIR_FREE(str); + return -1; } + VIR_FREE(str); + addrs->used[addr->bus][addr->slot] = 0xFF; return 0; - -cleanup: - for (last = *func, *func = 0; *func < last; (*func)++) - qemuDomainPCIAddressReleaseAddr(addrs, &tmp_addr); - return -1; } int qemuDomainPCIAddressEnsureAddr(qemuDomainPCIAddressSetPtr addrs, @@ -1512,51 +1486,21 @@ int qemuDomainPCIAddressEnsureAddr(qemuDomainPCIAddressSetPtr addrs, int qemuDomainPCIAddressReleaseAddr(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr) { - char *str; - int ret; - if (qemuPCIAddressCheck(addrs, addr) < 0) return -1; - str = qemuPCIAddressAsString(addr); - if (!str) - return -1; - - ret = virHashRemoveEntry(addrs->used, str); - - VIR_FREE(str); - - return ret; + addrs->used[addr->bus][addr->slot] &= ~(1 << addr->function); + return 0; } int qemuDomainPCIAddressReleaseSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr) { - char *str; - int ret = 0; - virDevicePCIAddress tmp_addr = *addr; - unsigned int *func = &tmp_addr.function; - if (qemuPCIAddressCheck(addrs, addr) < 0) return -1; - for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - str = qemuPCIAddressAsString(&tmp_addr); - if (!str) - return -1; - - if (!virHashLookup(addrs->used, str)) { - VIR_FREE(str); - continue; - } - - VIR_FREE(str); - - if (qemuDomainPCIAddressReleaseAddr(addrs, &tmp_addr) < 0) - ret = -1; - } - - return ret; + addrs->used[addr->bus][addr->slot] = 0; + return 0; } void qemuDomainPCIAddressSetFree(qemuDomainPCIAddressSetPtr addrs) @@ -1564,7 +1508,7 @@ void qemuDomainPCIAddressSetFree(qemuDomainPCIAddressSetPtr addrs) if (!addrs) return; - virHashFree(addrs->used); + VIR_FREE(addrs->used); VIR_FREE(addrs); } -- 1.8.1.5
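For reference, the core of the new representation is just a per-bus array of per-slot bitmasks. A minimal standalone sketch of that idea (the names below are invented for illustration; the real code in qemu_command.c adds validation, debug logging and the libvirt allocation helpers):

    #include <stdint.h>
    #include <stdbool.h>

    #define PCI_SLOTS_PER_BUS  32   /* QEMU_PCI_ADDRESS_LAST_SLOT in the patch */
    #define PCI_FUNCS_PER_SLOT  8   /* QEMU_PCI_ADDRESS_LAST_FUNCTION */

    /* one byte per slot; bit N set means function N of that slot is reserved */
    typedef uint8_t pci_bus_map[PCI_SLOTS_PER_BUS];

    static void reserve_function(pci_bus_map bus, unsigned slot, unsigned func)
    {
        bus[slot] |= (uint8_t)(1u << func);      /* mark a single function used */
    }

    static void reserve_whole_slot(pci_bus_map bus, unsigned slot)
    {
        bus[slot] = 0xFF;                        /* all eight functions taken */
    }

    static void release_function(pci_bus_map bus, unsigned slot, unsigned func)
    {
        bus[slot] &= (uint8_t)~(1u << func);
    }

    static bool slot_in_use(pci_bus_map bus, unsigned slot)
    {
        return bus[slot] != 0;                   /* any function set => slot occupied */
    }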

On 04/03/2013 11:50 AM, Ján Tomko wrote:
Each bus (just one so far) is represented by an array with 32 slots where each slot is stored as an 8-bit integer where each bit represents a function.
This makes operations with whole slots easier. --- src/qemu/qemu_command.c | 152 +++++++++++++++--------------------------------- 1 file changed, 48 insertions(+), 104 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index a16d5f1..e221c82 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -1187,8 +1187,14 @@ cleanup:
#define QEMU_PCI_ADDRESS_LAST_SLOT 32 #define QEMU_PCI_ADDRESS_LAST_FUNCTION 8 + +/* + * Each bit represents a function + * Each byte represents a slot + */ +typedef uint8_t _qemuDomainPCIAddressBus[QEMU_PCI_ADDRESS_LAST_SLOT];
I'm not sure why _qemuDomainPCIAddressBus has a _ at the beginning, but in general I think we frown on prepending _ to definitions that are local to a file.
struct _qemuDomainPCIAddressSet { - virHashTablePtr used; + _qemuDomainPCIAddressBus *used; virDevicePCIAddress lastaddr; };
@@ -1269,7 +1275,7 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, if (!str) goto cleanup;
- if (virHashLookup(addrs->used, str)) { + if (qemuDomainPCIAddressReserveAddr(addrs, addr) < 0) { if (info->addr.pci.function != 0) { virReportError(VIR_ERR_XML_ERROR, _("Attempted double use of PCI Address '%s' " @@ -1282,36 +1288,21 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, goto cleanup; }
- VIR_DEBUG("Remembering PCI addr %s", str); - if (virHashAddEntry(addrs->used, str, str) < 0) - goto cleanup; - str = NULL; - if ((info->addr.pci.function == 0) && (info->addr.pci.multi != VIR_DEVICE_ADDRESS_PCI_MULTI_ON)) { /* a function 0 w/o multifunction=on must reserve the entire slot */ - virDevicePCIAddress tmp_addr = *addr; - unsigned int *func = &tmp_addr.function; - - for (*func = 1; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - str = qemuPCIAddressAsString(&tmp_addr); - if (!str) - goto cleanup; - - if (virHashLookup(addrs->used, str)) { - virReportError(VIR_ERR_XML_ERROR, - _("Attempted double use of PCI Address '%s' " - "(need \"multifunction='off'\" for device " - "on function 0)"), - str); - goto cleanup; - } - - VIR_DEBUG("Remembering PCI addr %s (multifunction=off for function 0)", str); - if (virHashAddEntry(addrs->used, str, str)) - goto cleanup; - str = NULL; + ignore_value(qemuDomainPCIAddressReleaseAddr(addrs, addr)); + if (qemuDomainPCIAddressReserveSlot(addrs, addr) < 0) { + virReportError(VIR_ERR_XML_ERROR, + _("Attempted double use of PCI Address '%s' " + "(need \"multifunction='off'\" for device " + "on function 0)"),
This message isn't exactly correct - str contains the address for function 0, but it's possible that it was one of the other functions that caused the problem.

But there is yet another problem - qemuDomainPCIAddressReserveSlot() itself has already reported the error. You may want to have that function (and qemuDomainPCIAddressReserveAddr()) not report any error, but rely on the caller to report the error (since the caller will have more context, such as (in this case) that multifunction was set to on).
+ str); + goto cleanup; } + VIR_DEBUG("Remembering PCI slot: %s (multifunction=off)", str); + } else { + VIR_DEBUG("Remembering PCI addr: %s", str); } ret = 0; cleanup: @@ -1375,13 +1366,6 @@ int qemuDomainAssignAddresses(virDomainDefPtr def, return qemuDomainAssignPCIAddresses(def, qemuCaps, obj); }
-static void -qemuDomainPCIAddressSetFreeEntry(void *payload, - const void *name ATTRIBUTE_UNUSED) -{ - VIR_FREE(payload); -} - qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def) { qemuDomainPCIAddressSetPtr addrs; @@ -1389,8 +1373,8 @@ qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def) if (VIR_ALLOC(addrs) < 0) goto no_memory;
- if (!(addrs->used = virHashCreate(10, qemuDomainPCIAddressSetFreeEntry))) - goto error; + if (VIR_ALLOC_N(addrs->used, 1) < 0) + goto no_memory;
if (virDomainDeviceInfoIterate(def, qemuCollectPCIAddress, addrs) < 0) goto error; @@ -1411,25 +1395,11 @@ error: static int qemuDomainPCIAddressCheckSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr) { - char *str; - virDevicePCIAddress tmp_addr = *addr; - unsigned int *func = &(tmp_addr.function); - if (qemuPCIAddressCheck(addrs, addr) < 0) return -1;
- for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - str = qemuPCIAddressAsString(&tmp_addr); - if (!str) - return -1; - - if (virHashLookup(addrs->used, str)) { - VIR_FREE(str); - return -1; - } - - VIR_FREE(str); - } + if (addrs->used[addr->bus][addr->slot]) + return -1;
return 0; } @@ -1448,42 +1418,46 @@ int qemuDomainPCIAddressReserveAddr(qemuDomainPCIAddressSetPtr addrs,
VIR_DEBUG("Reserving PCI addr %s", str);
- if (virHashLookup(addrs->used, str)) { + if (addrs->used[addr->bus][addr->slot] & 1 << addr->function) { virReportError(VIR_ERR_INTERNAL_ERROR, _("unable to reserve PCI address %s"), str); VIR_FREE(str); return -1; }
- if (virHashAddEntry(addrs->used, str, str)) { - VIR_FREE(str); - return -1; - } + VIR_FREE(str);
addrs->lastaddr = *addr; addrs->lastaddr.function = 0; addrs->lastaddr.multi = 0; + addrs->used[addr->bus][addr->slot] |= 1 << addr->function; return 0; }
int qemuDomainPCIAddressReserveSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr) { - virDevicePCIAddress tmp_addr = *addr; - unsigned int *func = &tmp_addr.function; - unsigned int last; + char *str;
- for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - if (qemuDomainPCIAddressReserveAddr(addrs, &tmp_addr) < 0) - goto cleanup; + if (qemuPCIAddressCheck(addrs, addr) < 0) + return -1; + + str = qemuPCIAddressAsString(addr); + if (!str) + return -1; + + VIR_DEBUG("Reserving PCI slot %s", str); + + if (addrs->used[addr->bus][addr->slot]) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to reserve PCI slot %s"), str); + VIR_FREE(str); + return -1; }
+ VIR_FREE(str); + addrs->used[addr->bus][addr->slot] = 0xFF; return 0; - -cleanup: - for (last = *func, *func = 0; *func < last; (*func)++) - qemuDomainPCIAddressReleaseAddr(addrs, &tmp_addr); - return -1; }
int qemuDomainPCIAddressEnsureAddr(qemuDomainPCIAddressSetPtr addrs, @@ -1512,51 +1486,21 @@ int qemuDomainPCIAddressEnsureAddr(qemuDomainPCIAddressSetPtr addrs, int qemuDomainPCIAddressReleaseAddr(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr) { - char *str; - int ret; - if (qemuPCIAddressCheck(addrs, addr) < 0) return -1;
- str = qemuPCIAddressAsString(addr); - if (!str) - return -1; - - ret = virHashRemoveEntry(addrs->used, str); - - VIR_FREE(str); - - return ret; + addrs->used[addr->bus][addr->slot] &= ~(1 << addr->function); + return 0; }
int qemuDomainPCIAddressReleaseSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr) { - char *str; - int ret = 0; - virDevicePCIAddress tmp_addr = *addr; - unsigned int *func = &tmp_addr.function; - if (qemuPCIAddressCheck(addrs, addr) < 0) return -1;
- for (*func = 0; *func < QEMU_PCI_ADDRESS_LAST_FUNCTION; (*func)++) { - str = qemuPCIAddressAsString(&tmp_addr); - if (!str) - return -1; - - if (!virHashLookup(addrs->used, str)) { - VIR_FREE(str); - continue; - } - - VIR_FREE(str); - - if (qemuDomainPCIAddressReleaseAddr(addrs, &tmp_addr) < 0) - ret = -1; - } - - return ret; + addrs->used[addr->bus][addr->slot] = 0; + return 0; }
void qemuDomainPCIAddressSetFree(qemuDomainPCIAddressSetPtr addrs) @@ -1564,7 +1508,7 @@ void qemuDomainPCIAddressSetFree(qemuDomainPCIAddressSetPtr addrs) if (!addrs) return;
- virHashFree(addrs->used); + VIR_FREE(addrs->used); VIR_FREE(addrs); }
Otherwise looks okay.

From: liguang <lig.fnst@cn.fujitsu.com>

add a new controller type, then one can define a pci-bridge controller
like this:

  <controller type='pci-bridge' index='0'/>
  <controller type='pci-bridge' index='1'>
    <address type='pci' domain='0x0000' bus='0x00' slot='0x05' function='0x0'/>
  </controller>

actually, it works as a pci-bus, so as to support multi-pci-bus via
pci-to-pci bridge

Signed-off-by: liguang <lig.fnst@cn.fujitsu.com>
---
 docs/schemas/domaincommon.rng | 1 +
 src/conf/domain_conf.c        | 3 ++-
 src/conf/domain_conf.h        | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index 8d7e6db..b6dc013 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -1357,6 +1357,7 @@
             <value>sata</value>
             <value>ccid</value>
             <value>usb</value>
+            <value>pci-bridge</value>
           </choice>
         </attribute>
       </optional>
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index cc26f21..6a990bb 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -295,7 +295,8 @@ VIR_ENUM_IMPL(virDomainController, VIR_DOMAIN_CONTROLLER_TYPE_LAST,
               "sata",
               "virtio-serial",
               "ccid",
-              "usb")
+              "usb",
+              "pci-bridge")
 
 VIR_ENUM_IMPL(virDomainControllerModelSCSI, VIR_DOMAIN_CONTROLLER_MODEL_SCSI_LAST,
               "auto",
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index edddf25..1ec8564 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -682,6 +682,7 @@ enum virDomainControllerType {
     VIR_DOMAIN_CONTROLLER_TYPE_VIRTIO_SERIAL,
    VIR_DOMAIN_CONTROLLER_TYPE_CCID,
    VIR_DOMAIN_CONTROLLER_TYPE_USB,
+    VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE,
 
     VIR_DOMAIN_CONTROLLER_TYPE_LAST
 };
-- 
1.8.1.5

On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
add a new controller type, then one can define a pci-bridge controller like this: <controller type='pci-bridge' index='0'/>
In the next patch we're prohibiting exactly this config (index='0') because the pre-existing pci bus on the "pc-*" machinetypes is already named pci.0. If we don't allow it, we shouldn't include it as an example in the commit log :-)

More on this - one of the things this points out is that there is no representation in the config of the pci.0 bus, it's just assumed to always be there. That is the case for pc-* machinetypes (and probably several others with PCI buses), but for q35, there is no pci.0 bus in the basic machine, only a pcie.0; if you want a pci.0 on q35 (which *will* be necessary in order to attach any pci devices, so I imagine we will always want one), you have to attach a pcie->pci bridge, which is the device "i82801b11-bridge", to pcie.0.

The reason I bring this up here is I'm wondering:

1) should we have some representation of the default pci.0 bus in the config, even though it is just "always there" for the pc machinetypes and there is no way to disable it, and nothing on the commandline that specifies its existence?

2) For the q35 machinetype, should we just always add an i82801b11-bridge device and name it pci.0? Or should that need to be present in the xml?

3) Most important - depending on the answers to (1) and (2), should we maybe name this device "pci", and use a different backend depending on index and machinetype? (or alternately explicitly specifiable with a <driver> subelement). To be specific, we would have:

  <controller type='pci' index='0'/>

which on pc machinetypes would just be a placeholder in the config (and always inserted if it wasn't there, for machinetypes that have a pci bus). On the q35 machinetype, that same line would equate to adding an i82801b11-bridge device (with source defaulting to bus=pcie.0,addr=1e.0). This would serve several purposes:

a) on pc machinetypes, it would be a visual aid indicating that pci.0 exists, and that index='0' isn't available for a new pci controller.

b) it would make switching a domain config from pc to q35 simpler, since pci.0 would always already be in place for attaching pci devices (including pci.1, pci.2, etc)

c) it would make the config a true complete description of the machine being created.

(I've suggested naming the controller "pci" rather than "pci-bridge" because in the case of a "root" bus like pci.0 it seems to not be a "bridge", but maybe the name "pci-bridge" is always appropriate, even when it's a root bus. Maybe someone with better pci/pcie knowledge can provide an opinion on this)
<controller type='pci-bridge' index='1'> <address type='pci' domain='0x0000' bus='0x00' slot='0x05' function='0x0'/> </controller> actually, it works as a pci-bus, so as to support multi-pci-bus via pci-to-pci bridge
Signed-off-by: liguang <lig.fnst@cn.fujitsu.com> --- docs/schemas/domaincommon.rng | 1 + src/conf/domain_conf.c | 3 ++- src/conf/domain_conf.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng index 8d7e6db..b6dc013 100644 --- a/docs/schemas/domaincommon.rng +++ b/docs/schemas/domaincommon.rng @@ -1357,6 +1357,7 @@ <value>sata</value> <value>ccid</value> <value>usb</value> + <value>pci-bridge</value> </choice> </attribute> </optional> diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index cc26f21..6a990bb 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -295,7 +295,8 @@ VIR_ENUM_IMPL(virDomainController, VIR_DOMAIN_CONTROLLER_TYPE_LAST, "sata", "virtio-serial", "ccid", - "usb") + "usb", + "pci-bridge")
VIR_ENUM_IMPL(virDomainControllerModelSCSI, VIR_DOMAIN_CONTROLLER_MODEL_SCSI_LAST, "auto", diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index edddf25..1ec8564 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -682,6 +682,7 @@ enum virDomainControllerType { VIR_DOMAIN_CONTROLLER_TYPE_VIRTIO_SERIAL, VIR_DOMAIN_CONTROLLER_TYPE_CCID, VIR_DOMAIN_CONTROLLER_TYPE_USB, + VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE,
VIR_DOMAIN_CONTROLLER_TYPE_LAST };
If nobody thinks making the name of the controller "pci" instead of "pci-bridge" makes sense, then ACK once the commit log has the bad example removed. (We'll need to make the default initial index be 1 for these instead of 0, but I think that should go in the next patch anyway, because as I said above, there may be a use for a pci controller with index='0').

On Fri, Apr 05, 2013 at 12:32:04PM -0400, Laine Stump wrote:
On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
add a new controller type, then one can define a pci-bridge controller like this: <controller type='pci-bridge' index='0'/>
In the next patch we're prohibiting exactly this config (index='0') because the pre-existing pci bus on the "pc-*" machinetypes is already named pci.0. If we don't allow it, we shouldn't include it as an example in the commit log :-)
NB, it isn't always named 'pci.0' - on many arches it is merely 'pci'.
More on this - one of the things this points out is that there is no representation in the config of the pci.0 bus, it's just assumed to always be there. That is the case for pc-* machinetypes (and probably several others with PCI buses), but for q35, there is no pci.0 bus in the basic machine, only a pcie.0; if you want a pci.0 on q35 (which *will* be necessary in order to attach any pci devices, so I imagine we will always want one), you have to attach a pcie->pci bridge, which is the device "i82801b11-bridge", to pcie.0.
The reason I bring this up here, is I'm wondering:
1) should we have some representation of the default pci.0 bus in the config, even though it is just "always there" for the pc machinetypes and there is no way to disable it, and nothing on the commandline that specifies its existence?
Yep, we should be aiming for the XML to fully describe the machine hardware. So since we're adding the concept of PCI controllers/bridges etc to the XML, we should be auto-adding the default bus to the XML.
2) For the q35 machinetype, should we just always add an i82801b11-bridge device and name it pci.0? Or should that need to be present in the xml?
We've been burnt before auto-adding stuff that ought to have been optional. So I'd tend towards only having the minimal config that is required. If the users want this, let them explicitly ask for the bridge.

Also, from the apps POV the QEMU device name is irrelevant. The XML config works off the PCI addresses. So there's no need to force/specialcase an i82801b11-bridge to use the name 'pci.0'.
3) Most important - depending on the answers to (1) and (2), should we maybe name this device "pci", and use a different backend depending on index and machinetype? (or alternately explicitly specifiable with a <driver> subelement). To be specific, we would have:
<controller type='pci' index='0'/>
which on pc machinetypes would just be a placeholder in the config (and always inserted if it wasn't there, for machinetypes that have a pci bus). On the q35 machinetype, that same line would equate to adding an i82801b11-bridge device (with source defaulting to bus=pcie.0,addr=1e.0). This would serve several purposes:
a) on pc machinetypes, it would be a visual aid indicating that pci.0 exists, and that index='0' isn't available for a new pci controller.
b) it would make switching a domain config from pc to q35 simpler, since pci.0 would always already be in place for attaching pci devices (including pci.1, pci.2, etc)
c) it would make the config a true complete description of the machine being created.
(I've suggested naming the controller "pci" rather than "pci-bridge" because in the case of a "root" bus like pci.0 it seems to not be a "bridge", but maybe the name "pci-bridge" is always appropriate, even when it's a root bus. Maybe someone with better pci/pcie knowledge can provide an opinion on this)
I think "pci" is a little too generic - how about we call it 'pci-root' Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 04/05/2013 01:38 PM, Daniel P. Berrange wrote:
On Fri, Apr 05, 2013 at 12:32:04PM -0400, Laine Stump wrote:
On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
add a new controller type, then one can define a pci-bridge controller like this: <controller type='pci-bridge' index='0'/> In the next patch we're prohibiting exactly this config (index='0') because the pre-existing pci bus on the "pc-*" machinetypes is already named pci.0. If we don't allow it, we shouldn't include it as an example in the commit log :-) NB, it isn't always named 'pci.0' - on many arches it is merely 'pci'.
Yeah, I'm just using that as a convenient shorthand. The final decision on whether to use pci.0 or pci happens down in the qemuBuildCommandline().
More on this - one of the things this points out is that there is no representation in the config of the pci.0 bus, it's just assumed to always be there. That is the case for pc-* machinetypes (and probably several others with PCI buses), but for q35, there is no pci.0 bus in the basic machine, only a pcie.0; if you want a pci.0 on q35 (which *will* be necessary in order to attach any pci devices, so I imagine we will always want one), you have to attach a pcie->pci bridge, which is the device "i82801b11-bridge", to pcie.0. The reason I bring this up here, is I'm wondering:
1) should we have some representation of the default pci.0 bus in the config, even though it is just "always there" for the pc machinetypes and there is no way to disable it, and nothing on the commandline that specifies its existence? Yep, we should be aiming for the XML to fully describe the machine hardware. So since we're adding the concept of PCI controllers/bridges etc to the XML, we should be auto-adding the default bus to the XML.
2) For the q35 machinetype, should we just always add an i82801b11-bridge device and name it pci.0? Or should that need to be present in the xml? We've been burnt before auto-adding stuff that ought to have been optional. So I'd tend towards only having the minimal config that is required. If the users want this, let them explicitly ask for the bridge
Also from the apps POV the QEMU device name is irrelevant. The XML config works off the PCI addresses. So there's no need to force/specialcase a i82801b11-bridge to use the name 'pci.0'.
Sure. I just mean "pci bus 0" (hmm, but actually this does point out a problem with my logic - the same namespace (well, "numbering space") is used for both pcie and pci buses, so on a q35 system, bus=0 is already taken by pcie.0; that means that the first pci bus would need to use a different bus number anyway, so it wouldn't be so easy to switch an existing domain from pc to q35 - every PCI device would need to have its bus number modified. I suppose that's reasonable to expect, though.
3) Most important - depending on the answers to (1) and (2), should we maybe name this device "pci", and use a different backend depending on index and machinetype? (or alternately explicitly specifiable with a <driver> subelement). To be specific, we would have:
<controller type='pci' index='0'/>
which on pc machinetypes would just be a placeholder in the config (and always inserted if it wasn't there, for machinetypes that have a pci bus). On the q35 machinetype, that same line would equate to adding an i82801b11-bridge device (with source defaulting to bus=pcie.0,addr=1e.0). This would serve several purposes:
a) on pc machinetypes, it would be a visual aid indicating that pci.0 exists, and that index='0' isn't available for a new pci controller.
b) it would make switching a domain config from pc to q35 simpler, since pci.0 would always already be in place for attaching pci devices (including pci.1, pci.2, etc)
c) it would make the config a true complete description of the machine being created.
(I've suggested naming the controller "pci" rather than "pci-bridge" because in the case of a "root" bus like pci.0 it seems to not be a "bridge", but maybe the name "pci-bridge" is always appropriate, even when it's a root bus. Maybe someone with better pci/pcie knowledge can provide an opinion on this) I think "pci" is a little too generic - how about we call it 'pci-root'
Okay, so a separate "pci-root" device along with "pci-bridge"? What I was really hoping was to have all PCI buses represented in a common way in the config. How about a controller called "pci" with different types, "root" and "bridge"? And since they use the same numbering space as pcie buses, maybe the pcie controllers (including the root and the hubs and ???) would be different types of PCI controllers. That would make it easier (i.e. *possible*) to avoid collisions in use of bus numbers. Alex or mst, any advice/opinions on how to represent all the different q35 devices that consume bus numbers in a succinct fashion?

On Fri, 2013-04-05 at 14:42 -0400, Laine Stump wrote:
On 04/05/2013 01:38 PM, Daniel P. Berrange wrote:
On Fri, Apr 05, 2013 at 12:32:04PM -0400, Laine Stump wrote:
On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
add a new controller type, then one can define a pci-bridge controller like this: <controller type='pci-bridge' index='0'/> In the next patch we're prohibiting exactly this config (index='0') because the pre-existing pci bus on the "pc-*" machinetypes is already named pci.0. If we don't allow it, we shouldn't include it as an example in the commit log :-) NB, it isn't always named 'pci.0' - on many arches it is merely 'pci'.
Yeah, I'm just using that as a convenient shorthand. The final decision on whether to use pci.0 or pci happens down in the qemuBuildCommandline().
More on this - one of the things this points out is that there is no representation in the config of the pci.0 bus, it's just assumed to always be there. That is the case for pc-* machinetypes (and probably several others with PCI buses), but for q35, there is no pci.0 bus in the basic machine, only a pcie.0; if you want a pci.0 on q35 (which *will* be necessary in order to attach any pci devices, so I imagine we will always want one), you have to attach a pcie->pci bridge, which is the device "i82801b11-bridge", to pcie.0. The reason I bring this up here, is I'm wondering:
1) should we have some representation of the default pci.0 bus in the config, even though it is just "always there" for the pc machinetypes and there is no way to disable it, and nothing on the commandline that specifies its existence? Yep, we should be aiming for the XML to fully describe the machine hardware. So since we're adding the concept of PCI controllers/bridges etc to the XML, we should be auto-adding the default bus to the XML.
2) For the q35 machinetype, should we just always add an i82801b11-bridge device and name it pci.0? Or should that need to be present in the xml? We've been burnt before auto-adding stuff that ought to have been optional. So I'd tend towards only having the minimal config that is required. If the users want this, let them explicitly ask for the bridge
Also from the apps POV the QEMU device name is irrelevant. The XML config works off the PCI addresses. So there's no need to force/specialcase a i82801b11-bridge to use the name 'pci.0'.
Sure. I just mean "pci bus 0" (hmm, but actually this does point out a problem with my logic - the same namespace (well, "numbering space") is used for both pcie and pci buses, so on a q35 system, bus=0 is already taken by pcie.0; that means that the first pci bus would need to use a different bus number anyway, so it wouldn't be so easy to switch an existing domain from pc to q35 - every PCI device would need to have its bus number modified. I suppose that's reasonable to expect, though.
I would think you'd want to differentiate PCI from PCIe anyway. PCI is a bus and you have 32 slots per bus to fill. PCIe is a point-to-point link and you really only have slot 0 available. Perhaps that puts them in different number spaces already.
3) Most important - depending on the answers to (1) and (2), should we maybe name this device "pci", and use a different backend depending on index and machinetype? (or alternately explicitly specifiable with a <driver> subelement). To be specific, we would have:
<controller type='pci' index='0'/>
which on pc machinetypes would just be a placeholder in the config (and always inserted if it wasn't there, for machinetypes that have a pci bus). On the q35 machinetype, that same line would equate to adding an i82801b11-bridge device (with source defaulting to bus=pcie.0,addr=1e.0). This would serve several purposes:
a) on pc machinetypes, it would be a visual aid indicating that pci.0 exists, and that index='0' isn't available for a new pci controller.
b) it would make switching a domain config from pc to q35 simpler, since pci.0 would always already be in place for attaching pci devices (including pci.1, pci.2, etc)
c) it would make the config a true complete description of the machine being created.
(I've suggested naming the controller "pci" rather than "pci-bridge" because in the case of a "root" bus like pci.0 it seems to not be a "bridge", but maybe the name "pci-bridge" is always appropriate, even when it's a root bus. Maybe someone with better pci/pcie knowledge can provide an opinion on this) I think "pci" is a little too generic - how about we call it 'pci-root'
Okay, so a separate "pci-root" device along with "pci-bridge"? What I was really hoping was to have all PCI buses represented in a common way in the config. How about a controller called "pci" with different types, "root" and "bridge"? And since they use the same numbering space as pcie buses, maybe the pcie controllers (including the root and the hubs and ???) would be different types of PCI controllers. That would make it easier (i.e. *possible*) to avoid collisions in use of bus numbers.
Alex or mst, any advice/opinions on how to represent all the different q35 devices that consume bus numbers in a succinct fashion?
Note that none of these are really bus numbers, they're just bus identifiers. The BIOS and the running guest define the bus numbers. "root" also has special meaning in PCI, so for instance I wouldn't name a bus behind the i82801b11-bridge "pci-root".

Somehow we also need to deal with what can be attached where. For instance a pci-bridge is a PCI device and can only go on a PCI bus. The equivalent structure on PCIe is an upstream switch port with some number of downstream switch ports. Each of those are specific to the bus type.

For PCIe, we create new buses for root ports (ioh3420), upstream switch ports (xio3130-upstream), downstream switch ports (xio3130-downstream), and the dmi-to-pci bridge (i82801b11-bridge). For PCI, PCI-to-PCI bridges create new buses (pci-bridge and dec-21154).

One of my goals is to move us away from emulation of specific chips and create more devices like pci-bridge that adhere to the standard, but don't try to emulate a specific device. Then we might have "root-port", "pcie-upstream-switch-port", "pcie-downstream-switch-port", and "dmi-to-pci-bridge" (none of these names have been discussed).

Thanks,
Alex
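For illustration, the kind of device layering being described for q35 might look roughly like this on the QEMU command line (a sketch only; exact device properties and required options vary by QEMU version, and net0 is assumed to be a -netdev defined elsewhere):

  -device i82801b11-bridge,id=dmi2pci,bus=pcie.0,addr=0x1e \
  -device pci-bridge,id=pci.2,bus=dmi2pci,addr=0x1,chassis_nr=2 \
  -device virtio-net-pci,netdev=net0,bus=pci.2,addr=0x3

i.e. the dmi-to-pci bridge hangs off pcie.0, a pci-bridge hangs off the bus it creates, and ordinary PCI endpoints are then placed on the bridge's bus.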

On 04/05/2013 03:26 PM, Alex Williamson wrote:
On Fri, 2013-04-05 at 14:42 -0400, Laine Stump wrote:
On 04/05/2013 01:38 PM, Daniel P. Berrange wrote:
On Fri, Apr 05, 2013 at 12:32:04PM -0400, Laine Stump wrote:
On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
add a new controller type, then one can define a pci-bridge controller like this: <controller type='pci-bridge' index='0'/> In the next patch we're prohibiting exactly this config (index='0') because the pre-existing pci bus on the "pc-*" machinetypes is already named pci.0. If we don't allow it, we shouldn't include it as an example in the commit log :-) NB, it isn't always named 'pci.0' - on many arches it is merely 'pci'. Yeah, I'm just using that as a convenient shorthand. The final decision on whether to use pci.0 or pci happens down in the qemuBuildCommandline().
More on this - one of the things this points out is that there is no representation in the config of the pci.0 bus, it's just assumed to always be there. That is the case for pc-* machinetypes (and probably several others with PCI buses), but for q35, there is no pci.0 bus in the basic machine, only a pcie.0; if you want a pci.0 on q35 (which *will* be necessary in order to attach any pci devices, so I imagine we will always want one), you have to attach a pcie->pci bridge, which is the device "i82801b11-bridge", to pcie.0. The reason I bring this up here, is I'm wondering:
1) should we have some representation of the default pci.0 bus in the config, even though it is just "always there" for the pc machinetypes and there is no way to disable it, and nothing on the commandline that specifies its existence? Yep, we should be aiming for the XML to fully describe the machine hardware. So since we're adding the concept of PCI controllers/bridges etc to the XML, we should be auto-adding the default bus to the XML.
2) For the q35 machinetype, should we just always add an i82801b11-bridge device and name it pci.0? Or should that need to be present in the xml? We've been burnt before auto-adding stuff that ought to have been optional. So I'd tend towards only having the minimal config that is required. If the users want this, let them explicitly ask for the bridge
Okay. This makes for a larger burden on the user/virt-manager/boxes/libvirt-designer, but does prevent us from setting up an undesirable default that we can't rescue ourselves from :-)
Also from the apps POV the QEMU device name is irrelevant. The XML config works off the PCI addresses. So there's no need to force/specialcase a i82801b11-bridge to use the name 'pci.0'.
Sure. I just mean "pci bus 0" (hmm, but actually this does point out a problem with my logic - the same namespace (well, "numbering space") is used for both pcie and pci buses, so on a q35 system, bus=0 is already taken by pcie.0; that means that the first pci bus would need to use a different bus number anyway, so it wouldn't be so easy to switch an existing domain from pc to q35 - every PCI device would need to have its bus number modified. I suppose that's reasonable to expect, though. I would think you'd want to differentiate PCI from PCIe anyway. PCI is a bus and you have 32 slots per bus to fill. PCIe is a point-to-point link and you really only have slot 0 available. Perhaps that puts them in different number spaces already.
Are you saying that it's okay to have a bus=0 for pci and a different bus=0 for pcie? I was hoping that what is used in libvirt's config could mirror as closely as possible the numbering that you see in the output of lspci on the guest, but it sounds like that numbering is something done at the whim of the guest, with no basis in (standard) reality, is that right?
3) Most important - depending on the answers to (1) and (2), should we maybe name this device "pci", and use a different backend depending on index and machinetype? (or alternately explicitly specifiable with a <driver> subelement). To be specific, we would have:
<controller type='pci' index='0'/>
which on pc machinetypes would just be a placeholder in the config (and always inserted if it wasn't there, for machinetypes that have a pci bus). On the q35 machinetype, that same line would equate to adding an i82801b11-bridge device (with source defaulting to bus=pcie.0,addr=1e.0). This would serve several purposes:
a) on pc machinetypes, it would be a visual aid indicating that pci.0 exists, and that index='0' isn't available for a new pci controller.
b) it would make switching a domain config from pc to q35 simpler, since pci.0 would always already be in place for attaching pci devices (including pci.1, pci.2, etc)
c) it would make the config a true complete description of the machine being created.
(I've suggested naming the controller "pci" rather than "pci-bridge" because in the case of a "root" bus like pci.0 it seems to not be a "bridge", but maybe the name "pci-bridge" is always appropriate, even when it's a root bus. Maybe someone with better pci/pcie knowledge can provide an opinion on this) I think "pci" is a little too generic - how about we call it 'pci-root' Okay, so a separate "pci-root" device along with "pci-bridge"? What I was really hoping was to have all PCI buses represented in a common way in the config. How about a controller called "pci" with different types, "root" and "bridge"? And since they use the same numbering space as pcie buses, maybe the pcie controllers (including the root and the hubs and ???) would be different types of PCI controllers. That would make it easier (i.e. *possible*) to avoid collisions in use of bus numbers.
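To make that concrete, a single controller type distinguished by model might look roughly like this (purely a sketch - whether the model goes in an attribute or a sub-element, and the model names themselves, are still completely open):

  <controller type='pci' index='0' model='pci-root'/>   <!-- the always-present bus -->
  <controller type='pci' index='1' model='pci-bridge'>  <!-- extra bus plugged into bus 0 -->
    <address type='pci' bus='0' slot='4'/>
  </controller>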
Alex or mst, any advice/opinions on how to represent all the different q35 devices that consume bus numbers in a succinct fashion? Note that none of these are really bus numbers, they're just bus identifiers. The BIOS and the running guest define the bus numbers. "root" also has special meaning in PCI, so for instance I wouldn't name a bus behind the i82801b11-bridge "pci-root". Somehow we also need to deal with what can be attached where. For instance a pci-bridge is a PCI device and can only go on a PCI bus. The equivalent structure on PCIe is an upstream switch port with some number of downstream switch ports. Each of those is specific to the bus type.
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):

* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.

* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:

  <interface type='direct'>
    ...
    <address type='pci' domain='0' bus='0' slot='3' function='0'/>
    ...
  </interface>

* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.

* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.

* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:

  <controller type='pci-bridge' index='1'>
    <address type='pci' domain='0' bus='0' slot='10' function='0'/>
  </controller>

  and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.

* So let's assume that we add a new controller called "dmi-to-pci-bridge":

  <controller type='dmi-to-pci-bridge' index='0'/>

  Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with

  <address type='pci' bus='0' .../>

  How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in.

Perhaps we should be referencing the "<alias name='nnn'/>" element of each controller in the pci address of the target device, e.g.:

  <controller type='pci-bridge' index='0'>
    <alias name='pci.0'/>  <!-- obviously on a machine with no builtin pci.0! -->
  </controller>
  <controller type='dmi-to-pci-bridge' index='0'>
    <alias name='dmi-to-pci-bridge.0'/>
  </controller>
  <interface type='direct'>
    ...
    <address type='pci' controller='dmi-to-pci-bridge.0' slot='3' function='0'/>
  </interface>

(or, since this "controller" attribute really obsoletes the numeric "bus" attribute, maybe it could be "bus='dmi-to-pci-bridge.0'", and we could continue to support "bus='0'" for legacy configs).
I believe right now the alias name is always auto-generated; we would need to make that so that when explicitly provided it would be guaranteed to never change (and if that's not possible to do in a backward compatible manner, then we need to come up with some new attribute to use in this manner)

Alternately, we could add new types to address, one for each new type of controller, then define the devices like this:

  <interface type='direct'>
    <address type='pci-bridge' bus='0' slot='3' function='0'/>
  </interface>

  <interface type='direct'>
    <address type='dmi-to-pci-bridge' bus='0' slot='3' function='0'/>
  </interface>

(yes, I know you wouldn't want to plug a network device into the dmi-to-pci-bridge directly, this is just for the sake of example) You'll notice that this makes the bus attribute obsolete.

(side note: I know that this discussion has gone far beyond just talking about adding a single new type of controller (pci-bridge), but how we do this device will have implications far beyond, so we need to figure it out now.)
For PCIe, we create new buses for root ports (ioh3420), upstream switch ports (xio3130-upstream), downstream switch ports (xio3130-downstream), and the dmi-to-pci bridge (i82801b11-bridge). For PCI, PCI-to-PCI bridges create new buses (pci-bridge and dec-21154).
One of my goals is to move us away from emulation of specific chips and create more devices like pci-bridge that adhere to the standard, but don't try to emulate a specific device. Then we might have "root-port", "pcie-upstream-switch-port", "pcie-downstream-switch-port", and "dmi-to-pci-bridge" (none of these names have been discussed).
That makes sense to me at the level of libvirt, but in qemu don't you need to "emulate specific devices" anyway, in order for the guest OS to operate properly? If that's the case and there are different chips that implement the same functionality in a different manner, how would you decide which of those should be chosen as "the *only" dmi-to-pci-bridge"?

On Mon, Apr 08, 2013 at 12:37:49PM -0400, Laine Stump wrote:
On 04/05/2013 03:26 PM, Alex Williamson wrote: I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3' function='0'/> ... </interface>
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.
* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:
<controller type='pci-bridge' index='1'> <address type='pci' domain='0' bus='0' slot='10' function='0'/> </controller>
and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.
* So let's assume that we add a new controller called "dmi-to-pci-bridge":
<controller type='dmi-to-pci-bridge' index='0'/>
Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with
<address type='pci' bus='0' .../>
How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in.
Hmm yes, you're right - as long as we only have <address type='pci'> then all <controller> elements should use type='pci' too, and we should just distinguish based on the model name of the controller. So ignore my previous suggestion to have 'pci-bridge' and 'pci-root' types, we can only use type='pci' on <controller> elements. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 04/08/2013 12:48 PM, Daniel P. Berrange wrote:
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3' function='0'/> ... </interface>
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.
* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:
<controller type='pci-bridge' index='1'> <address type='pci' domain='0' bus='0' slot='10' function='0'/> </controller>
and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.
* So let's assume that we add a new controller called "dmi-to-pci-bridge":
<controller type='dmi-to-pci-bridge' index='0'/>
Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with
<address type='pci' bus='0' .../>
How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in. Hmm yes, you're right - as long as we only have <adress type='pci'>
On Mon, Apr 08, 2013 at 12:37:49PM -0400, Laine Stump wrote: then all <controller> elements should use type='pci' too, and we should just distinguish based on the model name of the controller. So ignore my previous suggestion to have 'pci-bridge' and 'pci-root' types, we can only use type='pci' on <controller> elements.
Okay, so that means we preserve the correlation between

  <controller type='pci' index='1'>

and

  <address type='pci' bus='1' ..../>

Should the <controller> device use, e.g. <model type='pci-bridge'/> for the model, as is done for <interface> devices? One notable difference is that in the case of <interface> (with the exception of "<model type='virtio'/>"), the model isn't used for anything except passing directly through to qemu (and very recently validating against a list of known interface models), while in the case of controllers with type='pci', different models will have different rules about what they can connect to and what can connect to them, and they will affect what is valid in other devices.

An example on a "pc" machinetype that has the builtin PCI bus, one extra pci-pci bridge, and an interface device plugged into slot 3 of the pci-bridge:

  <controller type='pci' index='0'>
    <model type='pci-root'/>  <!-- builtin pci bus -->
  </controller>
  <controller type='pci' index='1'>
    <model type='pci-bridge'/>
  </controller>
  <interface type='direct'>
    ...
    <address type='pci' bus='1' slot='3'/>
  </interface>

And for a q35 machinetype that has the root pcie, an i82801b11-bridge connected to slot 1e of that, a pci bridge connected to slot 1 of the i82801b11-bridge, and an interface plugged into slot 3 of the pci-bridge:

  <controller type='pci' index='0'>
    <model type='pcie-root'/>
  </controller>
  <controller type='pci' index='1'>
    <model type='i82801b11-bridge'/>  <!-- [*] -->
    <address type='pci' bus='0' slot='0x1e'/>
  </controller>
  <controller type='pci' index='2'>
    <model type='pci-bridge'/>
    <address type='pci' bus='1' slot='1'/>
  </controller>
  <interface type='direct'>
    ...
    <address type='pci' bus='2' slot='3'/>
  </interface>

(note that controllers with model='(pci|pcie)-root' will not have any <address> element, because they exist in the basic machine so we don't need to connect them anywhere.)

(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)

Does this make sense?

On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote:
On 04/08/2013 12:48 PM, Daniel P. Berrange wrote:
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3' function='0'/> ... </interface>
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.
* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:
<controller type='pci-bridge' index='1'> <address type='pci' domain='0' bus='0' slot='10' function='0'/> </controller>
and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.
* So let's assume that we add a new controller called "dmi-to-pci-bridge":
<controller type='dmi-to-pci-bridge' index='0'/>
Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with
<address type='pci' bus='0' .../>
How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in. Hmm yes, you're right - as long as we only have <adress type='pci'>
On Mon, Apr 08, 2013 at 12:37:49PM -0400, Laine Stump wrote: then all <controller> elements should use type='pci' too, and we should just distinguish based on the model name of the controller. So ignore my previous suggestion to have 'pci-bridge' and 'pci-root' types, we can only use type='pci' on <controller> elements.
Okay, so that means we preserve the correlation between
<controller type='pci' index='1'>
and
<address type='pci' bus='1' ..../>
Should the <controller> device use, e.g. <model type='pci-bridge'/> for the model, as is done for <interface> devices? One notable difference is that in the case of <interface> (with the exception of "<model type='virtio'/>"), the model isn't used for anything except passing directly through to qemu (and very recently validating against a list of known interface models), while in the case of controllers with type='pci', different models will have different rules about what they can connect to and what can connect to them, and they will affect what is valid in other devices.
An example on a "pc" machinetype that has the builtin PCI bus, one extra pci-pci bridge, and an interface device plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pci-root'/> <!-- builtin pci bus --> </controller> <controller type='pci' index='1'> <model type='pci-bridge'/> </controller> <interface type='direct'> ... <address type='pci' bus='1' slot='3'/> </controller>
And for a q35 machinetype that has the root pcie, an i82801b11-bridge connected to slot 1e of that, a pci bridge connected to slot 1 of the i82801b11-bridge, and an interface plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pcie-root'/> </controller> <controller type='pci' index='1'> <model type='i82801b11-bridge'/> <!-- [*] --> <address type='pci' bus='0' slot='0x1e'/> </controller> <controller type='pci' index='2'> <model type='pci-bridge'/> <address type='pci' bus='1' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' bus='2' slot='3'/> </controller>
(note that controllers with model='(pci|pcie)-root' will not have any <address> element, because they exist in the basic machine so we don't need to connect them to anywhere.)
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense?
Confused. So why are you using bus numbers at all? It's just wrong. -- MST

On 04/08/2013 03:37 PM, Michael S. Tsirkin wrote:
On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote:
On 04/08/2013 12:48 PM, Daniel P. Berrange wrote:
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3' function='0'/> ... </interface>
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.
* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:
<controller type='pci-bridge' index='1'> <address type='pci' domain='0' bus='0' slot='10' function='0'/> </controller>
and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.
* So let's assume that we add a new controller called "dmi-to-pci-bridge":
<controller type='dmi-to-pci-bridge' index='0'/>
Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with
<address type='pci' bus='0' .../>
How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in. Hmm yes, you're right - as long as we only have <adress type='pci'>
On Mon, Apr 08, 2013 at 12:37:49PM -0400, Laine Stump wrote: then all <controller> elements should use type='pci' too, and we should just distinguish based on the model name of the controller. So ignore my previous suggestion to have 'pci-bridge' and 'pci-root' types, we can only use type='pci' on <controller> elements. Okay, so that means we preserve the correlation between
<controller type='pci' index='1'>
and
<address type='pci' bus='1' ..../>
Should the <controller> device use, e.g. <model type='pci-bridge'/> for the model, as is done for <interface> devices? One notable difference is that in the case of <interface> (with the exception of "<model type='virtio'/>"), the model isn't used for anything except passing directly through to qemu (and very recently validating against a list of known interface models), while in the case of controllers with type='pci', different models will have different rules about what they can connect to and what can connect to them, and they will affect what is valid in other devices.
An example on a "pc" machinetype that has the builtin PCI bus, one extra pci-pci bridge, and an interface device plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pci-root'/> <!-- builtin pci bus --> </controller> <controller type='pci' index='1'> <model type='pci-bridge'/> </controller> <interface type='direct'> ... <address type='pci' bus='1' slot='3'/> </controller>
And for a q35 machinetype that has the root pcie, an i82801b11-bridge connected to slot 1e of that, a pci bridge connected to slot 1 of the i82801b11-bridge, and an interface plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pcie-root'/> </controller> <controller type='pci' index='1'> <model type='i82801b11-bridge'/> <!-- [*] --> <address type='pci' bus='0' slot='0x1e'/> </controller> <controller type='pci' index='2'> <model type='pci-bridge'/> <address type='pci' bus='1' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' bus='2' slot='3'/> </controller>
(note that controllers with model='(pci|pcie)-root' will not have any <address> element, because they exist in the basic machine so we don't need to connect them to anywhere.)
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense? Confused. So why are you using bus numbers at all?
I'm kind of wondering that too :-) That's why I suggested naming each controller that provided a connection point/bus and using the name instead of a number. I think initially the pci address element was given a bus number because it made the config in libvirt look similar to the view of the devices from the guest (and also that's what is necessary for a PCI address on the host side, e.g. when doing PCI passthrough); maybe now we're only considering keeping it for backwards compatibility and consistency with the hostside address element? Or does using numbers fit better with the paradigms for other hypervisors? (doesn't seem like that should be a problem, because the name can be completely internal to libvirt, and translated into what each different hypervisor needs to have).

On Mon, Apr 08, 2013 at 10:37:45PM +0300, Michael S. Tsirkin wrote:
On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote:
On 04/08/2013 12:48 PM, Daniel P. Berrange wrote:
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3' function='0'/> ... </interface>
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.
* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:
<controller type='pci-bridge' index='1'> <address type='pci' domain='0' bus='0' slot='10' function='0'/> </controller>
and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.
* So let's assume that we add a new controller called "dmi-to-pci-bridge":
<controller type='dmi-to-pci-bridge' index='0'/>
Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with
<address type='pci' bus='0' .../>
How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in. Hmm yes, you're right - as long as we only have <adress type='pci'>
On Mon, Apr 08, 2013 at 12:37:49PM -0400, Laine Stump wrote: then all <controller> elements should use type='pci' too, and we should just distinguish based on the model name of the controller. So ignore my previous suggestion to have 'pci-bridge' and 'pci-root' types, we can only use type='pci' on <controller> elements.
Okay, so that means we preserve the correlation between
<controller type='pci' index='1'>
and
<address type='pci' bus='1' ..../>
Should the <controller> device use, e.g. <model type='pci-bridge'/> for the model, as is done for <interface> devices? One notable difference is that in the case of <interface> (with the exception of "<model type='virtio'/>"), the model isn't used for anything except passing directly through to qemu (and very recently validating against a list of known interface models), while in the case of controllers with type='pci', different models will have different rules about what they can connect to and what can connect to them, and they will affect what is valid in other devices.
An example on a "pc" machinetype that has the builtin PCI bus, one extra pci-pci bridge, and an interface device plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pci-root'/> <!-- builtin pci bus --> </controller> <controller type='pci' index='1'> <model type='pci-bridge'/> </controller> <interface type='direct'> ... <address type='pci' bus='1' slot='3'/> </controller>
And for a q35 machinetype that has the root pcie, an i82801b11-bridge connected to slot 1e of that, a pci bridge connected to slot 1 of the i82801b11-bridge, and an interface plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pcie-root'/> </controller> <controller type='pci' index='1'> <model type='i82801b11-bridge'/> <!-- [*] --> <address type='pci' bus='0' slot='0x1e'/> </controller> <controller type='pci' index='2'> <model type='pci-bridge'/> <address type='pci' bus='1' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' bus='2' slot='3'/> </controller>
(note that controllers with model='(pci|pcie)-root' will not have any <address> element, because they exist in the basic machine so we don't need to connect them to anywhere.)
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense?
Confused. So why are you using bus numbers at all? It's just wrong.
They are not wrong. We use them to link the <address> element to the <controller> element. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Tue, Apr 09, 2013 at 09:59:50AM +0100, Daniel P. Berrange wrote:
On Mon, Apr 08, 2013 at 10:37:45PM +0300, Michael S. Tsirkin wrote:
On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote:
On 04/08/2013 12:48 PM, Daniel P. Berrange wrote:
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3' function='0'/> ... </interface>
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.
* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:
<controller type='pci-bridge' index='1'> <address type='pci' domain='0' bus='0' slot='10' function='0'/> </controller>
and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.
* So let's assume that we add a new controller called "dmi-to-pci-bridge":
<controller type='dmi-to-pci-bridge' index='0'/>
Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with
<address type='pci' bus='0' .../>
How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in. Hmm yes, you're right - as long as we only have <adress type='pci'>
On Mon, Apr 08, 2013 at 12:37:49PM -0400, Laine Stump wrote: then all <controller> elements should use type='pci' too, and we should just distinguish based on the model name of the controller. So ignore my previous suggestion to have 'pci-bridge' and 'pci-root' types, we can only use type='pci' on <controller> elements.
Okay, so that means we preserve the correlation between
<controller type='pci' index='1'>
and
<address type='pci' bus='1' ..../>
Should the <controller> device use, e.g. <model type='pci-bridge'/> for the model, as is done for <interface> devices? One notable difference is that in the case of <interface> (with the exception of "<model type='virtio'/>"), the model isn't used for anything except passing directly through to qemu (and very recently validating against a list of known interface models), while in the case of controllers with type='pci', different models will have different rules about what they can connect to and what can connect to them, and they will affect what is valid in other devices.
An example on a "pc" machinetype that has the builtin PCI bus, one extra pci-pci bridge, and an interface device plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pci-root'/> <!-- builtin pci bus --> </controller> <controller type='pci' index='1'> <model type='pci-bridge'/> </controller> <interface type='direct'> ... <address type='pci' bus='1' slot='3'/> </controller>
And for a q35 machinetype that has the root pcie, an i82801b11-bridge connected to slot 1e of that, a pci bridge connected to slot 1 of the i82801b11-bridge, and an interface plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pcie-root'/> </controller> <controller type='pci' index='1'> <model type='i82801b11-bridge'/> <!-- [*] --> <address type='pci' bus='0' slot='0x1e'/> </controller> <controller type='pci' index='2'> <model type='pci-bridge'/> <address type='pci' bus='1' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' bus='2' slot='3'/> </controller>
(note that controllers with model='(pci|pcie)-root' will not have any <address> element, because they exist in the basic machine so we don't need to connect them to anywhere.)
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense?
Confused. So why are you using bus numbers at all? It's just wrong.
They are not wrong. We use them to link the <address> element to the <controller> element.
Daniel
I see, so it's just a number, not a bus number in the PCI sense? Maybe name it bus_id or something.
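i.e. something like this (just illustrating the suggested rename, nothing else changed):

  <address type='pci' bus_id='1' slot='3' function='0'/>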

On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote:
On 04/08/2013 12:48 PM, Daniel P. Berrange wrote:
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3' function='0'/> ... </interface>
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.
* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:
<controller type='pci-bridge' index='1'> <address type='pci' domain='0' bus='0' slot='10' function='0'/> </controller>
and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.
* So let's assume that we add a new controller called "dmi-to-pci-bridge":
<controller type='dmi-to-pci-bridge' index='0'/>
Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with
<address type='pci' bus='0' .../>
How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in. Hmm yes, you're right - as long as we only have <adress type='pci'>
On Mon, Apr 08, 2013 at 12:37:49PM -0400, Laine Stump wrote: then all <controller> elements should use type='pci' too, and we should just distinguish based on the model name of the controller. So ignore my previous suggestion to have 'pci-bridge' and 'pci-root' types, we can only use type='pci' on <controller> elements.
Okay, so that means we preserve the correlation between
<controller type='pci' index='1'>
and
<address type='pci' bus='1' ..../>
Should the <controller> device use, e.g. <model type='pci-bridge'/> for the model, as is done for <interface> devices? One notable difference is that in the case of <interface> (with the exception of "<model type='virtio'/>"), the model isn't used for anything except passing directly through to qemu (and very recently validating against a list of known interface models), while in the case of controllers with type='pci', different models will have different rules about what they can connect to and what can connect to them, and they will affect what is valid in other devices.
An example on a "pc" machinetype that has the builtin PCI bus, one extra pci-pci bridge, and an interface device plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pci-root'/> <!-- builtin pci bus --> </controller> <controller type='pci' index='1'> <model type='pci-bridge'/> </controller> <interface type='direct'> ... <address type='pci' bus='1' slot='3'/> </controller>
And for a q35 machinetype that has the root pcie, an i82801b11-bridge connected to slot 1e of that, a pci bridge connected to slot 1 of the i82801b11-bridge, and an interface plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pcie-root'/> </controller> <controller type='pci' index='1'> <model type='i82801b11-bridge'/> <!-- [*] --> <address type='pci' bus='0' slot='0x1e'/> </controller> <controller type='pci' index='2'> <model type='pci-bridge'/> <address type='pci' bus='1' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' bus='2' slot='3'/> </controller>
(note that controllers with model='(pci|pcie)-root' will not have any <address> element, because they exist in the basic machine so we don't need to connect them to anywhere.)
Actually I do wonder if we should represent a PCI root as two <controller> elements, one representing the actual PCI root device, and the other representing the host bridge that is built-in.

Also we should use the actual model names, not 'pci-root' or 'pcie-root' but rather i440FX for "pc" machine type, and whatever the q35 model name is.

- One PCI root with built-in PCI bus (i.e. today's setup)

  <controller type="pci-root" index="0">
    <model name="i440FX"/>
  </controller>
  <controller type="pci" index="0">  <!-- Host bridge -->
    <address type='pci' domain='0' bus='0' slot='0'/>
  </controller>
  <interface type='direct'>
    ...
    <address type='pci' domain='0' bus='0' slot='3'/>
  </interface>

- One PCI root with built-in PCI bus and extra PCI bridge

  <controller type="pci-root" index="0">
    <model name="i440FX"/>
  </controller>
  <controller type="pci" index="0">  <!-- Host bridge -->
    <address type='pci' domain='0' bus='0' slot='0'/>
  </controller>
  <controller type="pci" index="1">  <!-- Additional bridge -->
    <address type='pci' domain='0' bus='0' slot='1'/>
  </controller>
  <interface type='direct'>
    ...
    <address type='pci' domain='0' bus='1' slot='3'/>
  </interface>

- One PCI root with built-in PCI bus, PCI-E bus and an extra PCI bridge (i.e. possible q35 setup)

  <controller type="pci-root" index="0">
    <model name="i440FX"/>
  </controller>
  <controller type="pci" index="0">  <!-- Host bridge -->
    <address type='pci' domain='0' bus='0' slot='0'/>
  </controller>
  <controller type="pci" index="1">  <!-- Additional bridge -->
    <address type='pci' domain='0' bus='0' slot='1'/>
  </controller>
  <controller type="pci" index="1">  <!-- Additional bridge -->
    <address type='pci' domain='0' bus='0' slot='1'/>
  </controller>
  <interface type='direct'>
    ...
    <address type='pci' domain='0' bus='1' slot='3'/>
  </interface>

So if we later allowed for multiple PCI roots, then we'd have something like

  <controller type="pci-root" index="0">
    <model name="i440FX"/>
  </controller>
  <controller type="pci-root" index="1">
    <model name="i440FX"/>
  </controller>
  <controller type="pci" index="0">  <!-- Host bridge 1 -->
    <address type='pci' domain='0' bus='0' slot='0'/>
  </controller>
  <controller type="pci" index="0">  <!-- Host bridge 2 -->
    <address type='pci' domain='1' bus='0' slot='0'/>
  </controller>
  <interface type='direct'>  <!-- NIC on host bridge 2 -->
    ...
    <address type='pci' domain='1' bus='0' slot='3'/>
  </interface>

NB this means that 'index' values can be reused against the <controller>, provided they are set up on different pci-roots.
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense?
Yep, I think we're fairly close. Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 04/09/2013 04:58 AM, Daniel P. Berrange wrote:
On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote:
On 04/08/2013 12:48 PM, Daniel P. Berrange wrote:
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3' function='0'/> ... </interface>
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.
* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:
<controller type='pci-bridge' index='1'> <address type='pci' domain='0' bus='0' slot='10' function='0'/> </controller>
and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.
* So let's assume that we add a new controller called "dmi-to-pci-bridge":
<controller type='dmi-to-pci-bridge' index='0'/>
Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with
<address type='pci' bus='0' .../>
How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in. Hmm yes, you're right - as long as we only have <adress type='pci'>
On Mon, Apr 08, 2013 at 12:37:49PM -0400, Laine Stump wrote: then all <controller> elements should use type='pci' too, and we should just distinguish based on the model name of the controller. So ignore my previous suggestion to have 'pci-bridge' and 'pci-root' types, we can only use type='pci' on <controller> elements. Okay, so that means we preserve the correlation between
<controller type='pci' index='1'>
and
<address type='pci' bus='1' ..../>
Should the <controller> device use, e.g. <model type='pci-bridge'/> for the model, as is done for <interface> devices? One notable difference is that in the case of <interface> (with the exception of "<model type='virtio'/>"), the model isn't used for anything except passing directly through to qemu (and very recently validating against a list of known interface models), while in the case of controllers with type='pci', different models will have different rules about what they can connect to and what can connect to them, and they will affect what is valid in other devices.
An example on a "pc" machinetype that has the builtin PCI bus, one extra pci-pci bridge, and an interface device plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pci-root'/> <!-- builtin pci bus --> </controller> <controller type='pci' index='1'> <model type='pci-bridge'/> </controller> <interface type='direct'> ... <address type='pci' bus='1' slot='3'/> </controller>
And for a q35 machinetype that has the root pcie, an i82801b11-bridge connected to slot 1e of that, a pci bridge connected to slot 1 of the i82801b11-bridge, and an interface plugged into slot 3 of the pci-bridge:
<controller type='pci' index='0'> <model type='pcie-root'/> </controller> <controller type='pci' index='1'> <model type='i82801b11-bridge'/> <!-- [*] --> <address type='pci' bus='0' slot='0x1e'/> </controller> <controller type='pci' index='2'> <model type='pci-bridge'/> <address type='pci' bus='1' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' bus='2' slot='3'/> </controller>
(note that controllers with model='(pci|pcie)-root' will not have any <address> element, because they exist in the basic machine so we don't need to connect them to anywhere.) Actually I do wonder if we should represent a PCI root as two <controller> elements, one representing the actual PCI root device, and the other representing the host bridge that is built-in.
Also we should use the actual model names, not 'pci-root' or 'pcie-root' but rather i440FX for "pc" machine type, and whatever the q35 model name is.
- One PCI root with built-in PCI bus (ie todays' setup)
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0''/>
Isn't this saying that the bridge connects to itself? (since bus 0 is this bus) I understand (again, possibly wrongly) that the builtin PCI bus connects to the chipset using its own slot 0 (that's why it's reserved), but that's its address on itself. How is this bridge associated with the pci-root? Ah, I *think* I see it - the domain attribute of the pci controller is matched to the index of the pci-root controller, correct? But there's still something strange about the <address> of the pci controller being self-referential.
</controller> <interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3'/> </controller>
- One PCI root with built-in PCI bus and extra PCI bridge
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
- One PCI root with built-in PCI bus, PCI-E bus and and extra PCI bridge (ie possible q35 setup)
Why would a q35 machine have an i440FX pci-root?
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller>
I think you did a cut-paste here and intended to change something, but didn't - those two bridges are identical.
<interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
So if we later allowed for mutiple PCI roots, then we'd have something like
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci-root" index="1"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge 1 --> <address type='pci' domain='0' bus='0' slot='0''/> </controller> <controller type="pci" index="0"> <!-- Host bridge 2 --> <address type='pci' domain='1' bus='0' slot='0''/> </controller> <interface type='direct'> <!-- NIC on host bridge 2 --> ... <address type='pci' domain='1' bus='0' slot='3'/> </controller>
NB this means that 'index' values can be reused against the <controller>, provided they are setup on different pci-roots.
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense? Yep, I think we're fairly close.
What about the other types of pci controllers that are used by PCIe? We should make sure they fit in this model before we settle on it.
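For instance (a sketch only, reusing the <model> form from earlier in the thread and the device names Alex listed - none of this has been agreed), a PCIe root port and an upstream switch port might slot into the same scheme as:

  <controller type='pci' index='1'>
    <model type='ioh3420'/>            <!-- PCIe root port -->
    <address type='pci' bus='0' slot='2'/>
  </controller>
  <controller type='pci' index='2'>
    <model type='xio3130-upstream'/>   <!-- upstream switch port -->
    <address type='pci' bus='1' slot='0'/>
  </controller>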

On Tue, Apr 09, 2013 at 04:06:06PM -0400, Laine Stump wrote:
On 04/09/2013 04:58 AM, Daniel P. Berrange wrote:
On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote: Actually I do wonder if we should reprent a PCI root as two <controller> elements, one representing the actual PCI root device, and the other representing the host bridge that is built-in.
Also we should use the actual model names, not 'pci-root' or 'pcie-root' but rather i440FX for "pc" machine type, and whatever the q35 model name is.
- One PCI root with built-in PCI bus (ie todays' setup)
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0''/>
Isn't this saying that the bridge connects to itself? (since bus 0 is this bus)
I understand (again, possibly wrongly) that the builtin PCI bus connects to the chipset using its own slot 0 (that's why it's reserved), but that's its address on itself. How is this bridge associated with the pci-root?
Ah, I *think* I see it - the domain attribute of the pci controller is matched to the index of the pci-root controller, correct? But there's still something strange about the <address> of the pci controller being self-referential.
Yes, the index of the pci-root matches the 'domain' of <address>
</controller> <interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3'/> </controller>
- One PCI root with built-in PCI bus and extra PCI bridge
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
- One PCI root with built-in PCI bus, PCI-E bus and and extra PCI bridge (ie possible q35 setup)
Why would a q35 machine have an i440FX pci-root?
It shouldn't, that's a typo
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller>
I think you did a cut-paste here and intended to change something, but didn't - those two bridges are identical.
Yep, the slot should be 2 in the second one
<interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
So if we later allowed for mutiple PCI roots, then we'd have something like
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci-root" index="1"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge 1 --> <address type='pci' domain='0' bus='0' slot='0''/> </controller> <controller type="pci" index="0"> <!-- Host bridge 2 --> <address type='pci' domain='1' bus='0' slot='0''/> </controller> <interface type='direct'> <!-- NIC on host bridge 2 --> ... <address type='pci' domain='1' bus='0' slot='3'/> </controller>
NB this means that 'index' values can be reused against the <controller>, provided they are setup on different pci-roots.
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense? Yep, I think we're fairly close.
What about the other types of pci controllers that are used by PCIe? We should make sure they fit in this model before we settle on it.
What do they do ? Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 04/10/2013 05:26 AM, Daniel P. Berrange wrote:
On Tue, Apr 09, 2013 at 04:06:06PM -0400, Laine Stump wrote:
On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote: Actually I do wonder if we should reprent a PCI root as two <controller> elements, one representing the actual PCI root device, and the other representing the host bridge that is built-in.
Also we should use the actual model names, not 'pci-root' or 'pcie-root' but rather i440FX for "pc" machine type, and whatever the q35 model name is.
- One PCI root with built-in PCI bus (ie todays' setup)
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0''/> Isn't this saying that the bridge connects to itself? (since bus 0 is
On 04/09/2013 04:58 AM, Daniel P. Berrange wrote: this bus)
I understand (again, possibly wrongly) that the builtin PCI bus connects to the chipset using its own slot 0 (that's why it's reserved), but that's its address on itself. How is this bridge associated with the pci-root?
Ah, I *think* I see it - the domain attribute of the pci controller is matched to the index of the pci-root controller, correct? But there's still something strange about the <address> of the pci controller being self-referential. Yes, the index of the pci-root matches the 'domain' of <address>
Okay, then the way that libvirt differentiates between a pci bridge that is connected to the root, and one that is connected to a slot of another bridge is 1) the "bus" attribute of the bridge's <address> matches the "index" attribute of the bridge itself, and 2) "slot" is always 0. Correct? (The corollary of this is that if slot == 0 and bus != index, or bus == index and slot != 0, it is a configuration error). I'm still unclear on the usefulness of the pci-root controller though - all the necessary information is contained in the pci controller, except for the type of root. But in the case of pcie root, I think you're not allowed to connect a standard bridge to it, only a "dmi-to-pci-bridge" (i82801b11-bridge)
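(A quick illustration of the rule I'm describing, reusing the notation from the examples in this thread - this is just a sketch, not anything libvirt produces today:

  <controller type="pci" index="0">      <!-- bus == index, slot == 0: attached to the root -->
    <address type='pci' domain='0' bus='0' slot='0'/>
  </controller>
  <controller type="pci" index="1">      <!-- bus != index, slot != 0: plugged into a slot of bridge 0 -->
    <address type='pci' domain='0' bus='0' slot='1'/>
  </controller>

Anything outside those two patterns would be rejected as a configuration error.)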
</controller> <interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3'/> </controller>
- One PCI root with built-in PCI bus and extra PCI bridge
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
- One PCI root with built-in PCI bus, PCI-E bus and and extra PCI bridge (ie possible q35 setup) Why would a q35 machine have an i440FX pci-root? It shouldn't, that's a typo
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> I think you did a cut-paste here and intended to change something, but didn't - those two bridges are identical. Yep, the slot should be 2 in the second one
<interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
So if we later allowed for mutiple PCI roots, then we'd have something like
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci-root" index="1"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge 1 --> <address type='pci' domain='0' bus='0' slot='0''/> </controller> <controller type="pci" index="0"> <!-- Host bridge 2 --> <address type='pci' domain='1' bus='0' slot='0''/> </controller> <interface type='direct'> <!-- NIC on host bridge 2 --> ... <address type='pci' domain='1' bus='0' slot='3'/> </controller>
NB this means that 'index' values can be reused against the <controller>, provided they are setup on different pci-roots.
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense? Yep, I think we're fairly close. What about the other types of pci controllers that are used by PCIe? We should make sure they fit in this model before we settle on it. What do they do ?
Although I've asked that question, and had it answered, several times now, without a tabular list I'm still unable to get the answer to that question clear in my mind :-/
Here are the controllers that aw has talked about (in a recent response on a different sub-thread):
On 04/05/2013 03:26 PM, Alex Williamson wrote:
For PCIe, we create new buses for root ports (ioh3420), upstream switch ports (xio3130-upstream), downstream switch ports (xio3130-downstream), and the dmi-to-pci bridge (i82801b11-bridge). For PCI, PCI-to-PCI bridges create new buses (pci-bridge and dec-21154).
Alex: what does each of these connect to, and what can be connected to them?
root-port (ioh3420)
upstream-switch-port (xio3130-upstream)
downstream-switch-port (xio3130-downstream)
dmi-to-pci-bridge (i82801b11-bridge)
pci-bridge (we already know/understand this one)
dec-21154 (is this identical to pci-bridge in behavior/function?)
One of my goals is to move us away from emulation of specific chips and create more devices like pci-bridge that adhere to the standard, but don't try to emulate a specific device. Then we might have "root-port", "pcie-upstream-switch-port", "pcie-downstream-switch-port", and "dmi-to-pci-bridge" (none of these names have been discussed).

On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote:
On Tue, Apr 09, 2013 at 04:06:06PM -0400, Laine Stump wrote:
On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote: Actually I do wonder if we should reprent a PCI root as two <controller> elements, one representing the actual PCI root device, and the other representing the host bridge that is built-in.
Also we should use the actual model names, not 'pci-root' or 'pcie-root' but rather i440FX for "pc" machine type, and whatever the q35 model name is.
- One PCI root with built-in PCI bus (ie todays' setup)
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0''/> Isn't this saying that the bridge connects to itself? (since bus 0 is
On 04/09/2013 04:58 AM, Daniel P. Berrange wrote: this bus)
I understand (again, possibly wrongly) that the builtin PCI bus connects to the chipset using its own slot 0 (that's why it's reserved), but that's its address on itself. How is this bridge associated with the pci-root?
Ah, I *think* I see it - the domain attribute of the pci controller is matched to the index of the pci-root controller, correct? But there's still something strange about the <address> of the pci controller being self-referential. Yes, the index of the pci-root matches the 'domain' of <address>
Okay, then the way that libvirt differentiates between a pci bridge that is connected to the root, and one that is connected to a slot of another bridge is 1) the "bus" attribute of the bridge's <address> matches the "index" attribute of the bridge itself, and 2) "slot" is always 0. Correct?
(The corollary of this is that if slot == 0 and bus != index, or bus == index and slot != 0, it is a configuration error).
I'm still unclear on the usefulness of the pci-root controller though - all the necessary information is contained in the pci controller, except for the type of root. But in the case of pcie root, I think you're not allowed to connect a standard bridge to it, only a "dmi-to-pci-bridge" (i82801b11-bridge)
Yes you can connect a pci bridge to pcie-root. It's represented as a root complex integrated device.
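For reference, a rough qemu command line for what Michael describes - a pci-bridge plugged straight into the root complex as an integrated device. The q35 root bus name pcie.0 is standard, but the slot, ids and the e1000 behind it are made-up values for illustration:

  -M q35 \
  -device pci-bridge,chassis_nr=1,id=pci.1,bus=pcie.0,addr=0x3 \
  -device e1000,bus=pci.1,addr=0x1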
</controller> <interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3'/> </controller>
- One PCI root with built-in PCI bus and extra PCI bridge
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
- One PCI root with built-in PCI bus, PCI-E bus and and extra PCI bridge (ie possible q35 setup) Why would a q35 machine have an i440FX pci-root? It shouldn't, that's a typo
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> I think you did a cut-paste here and intended to change something, but didn't - those two bridges are identical. Yep, the slot should be 2 in the second one
<interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
So if we later allowed for mutiple PCI roots, then we'd have something like
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci-root" index="1"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge 1 --> <address type='pci' domain='0' bus='0' slot='0''/> </controller> <controller type="pci" index="0"> <!-- Host bridge 2 --> <address type='pci' domain='1' bus='0' slot='0''/> </controller> <interface type='direct'> <!-- NIC on host bridge 2 --> ... <address type='pci' domain='1' bus='0' slot='3'/> </controller>
NB this means that 'index' values can be reused against the <controller>, provided they are setup on different pci-roots.
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense? Yep, I think we're fairly close. What about the other types of pci controllers that are used by PCIe? We should make sure they fit in this model before we settle on it. What do they do ?
Although I've asked that question, and had it answered, several times now, without a tabular list, I'm still unable to get the answer to that question clear in my mind :-/
Here's the controllers that aw has talked about (in a recent response on a different sub-thread):
On 04/05/2013 03:26 PM, Alex Williamson wrote:
For PCIe, we create new buses for root ports (ioh3420), upstream switch ports (xio3130-upstream), downstream switch ports (xio3130-downstream), and the dmi-to-pci bridge (i82801b11-bridge). For PCI, PCI-to-PCI bridges create new buses (pci-bridge and dec-21154).
Alex: what do each of these connect to, and what can be connected to them?
root-port (ioh3420)
this is connected to the root bus. any express device connects here (at most one device but it can be multifunction)
upstream-switch-port (xio3130-upstream)
this connects to a root port or downstream port, as any express device. only downstream ports connect here.
downstream-switch-port (xio3130-downstream)
this connects to the upstream port. any express device connects here (at most one device but it can be multifunction)
dmi-to-pci-bridge (i82801b11-bridge)
this connects to the root bus. any pci (non express) device connects here
We already know/understand this one: pci-bridge; is this identical in behavior/function? dec-21154
Yes, only dec pretends it's an intel bridge device, pci-bridge tells guest it's a generic device. Does not seem to matter with any guest I tried.
One of my goals is to move us away from emulation of specific chips and create more devices like pci-bridge that adhere to the standard, but don't try to emulate a specific device. Then we might have "root-port", "pcie-upstream-switch-port", "pcie-downstream-switch-port", and "dmi-to-pci-bridge" (none of these names have been discussed).
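To tie Alex's descriptions together, here is a sketch of the qemu command line for the full PCIe chain (root port -> upstream switch port -> downstream switch port -> endpoint); the device names are the real qemu ones listed above, but the ids, chassis/slot/port numbers and the NIC are invented for the example:

  -M q35 \
  -device ioh3420,bus=pcie.0,addr=0x4,port=1,chassis=1,id=rp1 \
  -device x3130-upstream,bus=rp1,id=up1 \
  -device xio3130-downstream,bus=up1,chassis=2,slot=0,id=dp1 \
  -device virtio-net-pci,bus=dp1,addr=0x0

Each -device creates a new bus that the next line plugs into, which is exactly the parent/child relationship that libvirt's <controller>/<address> pairs will have to express.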

On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote:
On Tue, Apr 09, 2013 at 04:06:06PM -0400, Laine Stump wrote:
On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote: Actually I do wonder if we should reprent a PCI root as two <controller> elements, one representing the actual PCI root device, and the other representing the host bridge that is built-in.
Also we should use the actual model names, not 'pci-root' or 'pcie-root' but rather i440FX for "pc" machine type, and whatever the q35 model name is.
- One PCI root with built-in PCI bus (ie todays' setup)
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0''/> Isn't this saying that the bridge connects to itself? (since bus 0 is
On 04/09/2013 04:58 AM, Daniel P. Berrange wrote: this bus)
I understand (again, possibly wrongly) that the builtin PCI bus connects to the chipset using its own slot 0 (that's why it's reserved), but that's its address on itself. How is this bridge associated with the pci-root?
Ah, I *think* I see it - the domain attribute of the pci controller is matched to the index of the pci-root controller, correct? But there's still something strange about the <address> of the pci controller being self-referential. Yes, the index of the pci-root matches the 'domain' of <address>
Okay, then the way that libvirt differentiates between a pci bridge that is connected to the root, and one that is connected to a slot of another bridge is 1) the "bus" attribute of the bridge's <address> matches the "index" attribute of the bridge itself, and 2) "slot" is always 0. Correct?
(The corollary of this is that if slot == 0 and bus != index, or bus == index and slot != 0, it is a configuration error).
I'm still unclear on the usefulness of the pci-root controller though - all the necessary information is contained in the pci controller, except for the type of root. But in the case of pcie root, I think you're not allowed to connect a standard bridge to it, only a "dmi-to-pci-bridge" (i82801b11-bridge) Yes you can connect a pci bridge to pcie-root. It's represented as a root complex integrated device.
ARGHH!! Just when I think I'm starting to understand *something* about these devices... (later edit: after some coaching on IRC, I *think* I've got a bit better handle on it.)
</controller> <interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3'/> </controller>
- One PCI root with built-in PCI bus and extra PCI bridge
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
- One PCI root with built-in PCI bus, PCI-E bus and and extra PCI bridge (ie possible q35 setup) Why would a q35 machine have an i440FX pci-root? It shouldn't, that's a typo
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> I think you did a cut-paste here and intended to change something, but didn't - those two bridges are identical. Yep, the slot should be 2 in the second one
<interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
So if we later allowed for mutiple PCI roots, then we'd have something like
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci-root" index="1"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge 1 --> <address type='pci' domain='0' bus='0' slot='0''/> </controller> <controller type="pci" index="0"> <!-- Host bridge 2 --> <address type='pci' domain='1' bus='0' slot='0''/> </controller>
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>?
<interface type='direct'> <!-- NIC on host bridge 2 --> ... <address type='pci' domain='1' bus='0' slot='3'/> </controller>
NB this means that 'index' values can be reused against the <controller>, provided they are setup on different pci-roots.
(also note that it might happen that the bus number in libvirt's config will correspond to the bus numbering that shows up in the guest OS, but that will just be a happy coincidence)
Does this make sense? Yep, I think we're fairly close. What about the other types of pci controllers that are used by PCIe? We should make sure they fit in this model before we settle on it. What do they do ?
(The descriptions of different models below tell what each of these other devices does; in short, they're all just some sort of electronic Lego to help connect PCI and PCIe devices into a tree.)

Okay, I'll make yet another attempt at understanding these devices, and suggesting how they can all be described in the XML. I'm thinking that *all* of the express hubs, switch ports, bridges, etc. can be described in xml in the manner above, i.e.:

  <controller type='pci' index='n'>
    <model type='xxx'/>
  </controller>

and that the method for connecting a device to any of them would be by specifying:

  <address type='pci' domain='n' bus='n' slot='n' function='n'/>

Any limitations about which devices/controllers can connect to which controllers, and how many devices can connect to any particular controller, will be derived from the <model type='xxx'/>.

(And, as we've said before, although qemu doesn't assign each of these controllers a numeric bus id, and although we can make no guarantee that the bus id we use for a particular controller is what will be used by the guest BIOS/OS, it's still a convenient notation and works well with other hypervisors as well as qemu. I'll also note that when I run lspci on an X58-based machine I have here, *all* of the relationships between all the devices listed below are described with simple bus:slot.function numbers.)

Here is a list of the pci controller model types and their restrictions (thanks to mst and aw for repeating these over and over to me; I'm sure I still have made mistakes, but at least it's getting closer).

<controller type='pci-root'>
============================
Upstream: nothing
Downstream: only a single pci-root-bus (implied)
qemu commandline: nothing (it's implied in the q35 machinetype)

Explanation: Each machine will have a different controller called "pci-root" as outlined above by Daniel. Two types of pci-root will be supported: i440FX and q35. If a pci-root is not spelled out in the config, one will be auto-added (depending on machinetype).

An i440FX pci-root has an implicitly added pci-bridge at 0:0:0.0 (and any bridge that has an address of slot='0' on its own bus is, by definition, connected to a pci-root controller - the two are matched by setting "domain" in the address of the pci-bridge to "index" of the pci-root). This bridge can only have PCI devices added.

A q35 pci-root also implies a different kind of pci-bridge device - one that can only have PCIe devices/controllers attached, but is otherwise identical to the pci-bridge added for i440FX. This bus will be called "root-bus". (Note that there are generally followed conventions for what can be connected to which slot on this bus, and we will probably follow those conventions when building a machine, *but* we will not hardcode this convention into libvirt; each q35 machine will be an empty slate.)

<controller type='pci'>
=======================
This will be used for *all* of the following controller devices supported by qemu:

<model type='pcie-root-bus'/> (implicit/integrated)
----------------------------
Upstream: connect to pci-root controller *only*
Downstream: 32 slots, PCIe devices only, no hotplug.
qemu commandline: nothing (implicit in the q35-* machinetype)

This controller is the bus described above that connects to a q35's pci-root, and provides places for PCIe devices to connect. Examples are root-ports, dmi-to-pci-bridges, sata controllers, integrated sound/usb/ethernet devices (do any of those that can be connected to the pcie-root-bus exist yet?). There is only one of these controllers, and it will *always* be index='0', and will always have the following address:

  <address type='pci' domain='0' bus='0' slot='0' function='0'/>

<model type='root-port'/> (ioh3420)
-------------------------
Upstream: PCIe, connect to pcie-root-bus *only* (?)
Downstream: 1 slot, PCIe devices only (?)
qemu commandline: -device ioh3420,...

These can only connect to the "pcie-root-bus" of a q35 (implying that this bus will need to have a different model name than the simple "pci-bridge").

<model type='dmi-to-pci-bridge'/> (i82801b11-bridge)
---------------------------------
(btw, what does "dmi" mean?)
Upstream: pcie-root-bus *only*
Downstream: 32 slots, any PCI device, no hotplug (?)
qemu commandline: -device i82801b11-bridge,...

<model type='upstream-switch-port'/> (x3130-upstream)
------------------------------------
Upstream: PCIe, connect to pcie-root-bus, root-port, or downstream-switch-port (?)
Downstream: 32 slots, connect *only* to downstream-switch-port
qemu-commandline: -device x3130-upstream

This is the upper side of a switch that can multiplex multiple devices onto a single port. It's only useful when one or more downstream switch ports are connected to it.

<model type='downstream-switch-port'/> (xio3130-downstream)
--------------------------------------
Upstream: connect *only* to upstream-switch-port
Downstream: 1 slot, any PCIe device
qemu commandline: -device xio3130-downstream

You can connect one or more of these to an upstream-switch-port in order to effectively plug multiple devices into a single PCIe port.

<model type='pci-bridge'/> (pci-bridge)
--------------------------
Upstream: PCI, connect to 1) pci-root, 2) dmi-to-pci-bridge, 3) another pci-bridge
Downstream: any PCI device, 32 slots
qemu commandline: -device pci-bridge,...

This differs from dmi-to-pci-bridge in that its upstream connection is PCI rather than PCIe (so it will work on an i440FX system, which has no root PCIe bus) and that hotplug is supported. In general, if a guest will have any PCI devices, one of these controllers should be added, and the PCI devices connected to it rather than to the dmi-to-pci-bridge.

===============================================================

Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:

1) eliminate the separate pci-root controller type
2) within <controller type='pci'>, a new <model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an <address> element (neither will pcie-root-bus).
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller.
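(As a sanity check on the dmi-to-pci-bridge / pci-bridge split above, this is roughly how the qemu command line would chain them on q35 - the device names are qemu's, the ids, addresses and chassis_nr are only illustrative:

  -M q35 \
  -device i82801b11-bridge,bus=pcie.0,addr=0x1e,id=pci.1 \
  -device pci-bridge,bus=pci.1,addr=0x1,chassis_nr=2,id=pci.2 \
  -device rtl8139,bus=pci.2,addr=0x3

The i82801b11-bridge hangs off the root complex and converts to conventional PCI; the pci-bridge behind it is where ordinary, hotpluggable PCI devices would end up.)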

On Fri, Apr 12, 2013 at 11:46:15AM -0400, Laine Stump wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote:
So if we later allowed for mutiple PCI roots, then we'd have something like
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci-root" index="1"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge 1 --> <address type='pci' domain='0' bus='0' slot='0''/> </controller> <controller type="pci" index="0"> <!-- Host bridge 2 --> <address type='pci' domain='1' bus='0' slot='0''/> </controller>
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>?
IIUC there is a limit on the number of PCI buses you can create per domain, due to fixed size of PCI addresses. Google suggests to me the limit is 256. So for domain 1, we could just start index at 256, and domain 2 at 512, etc
Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:
1) eliminate the separate pci-root controller type
Ok, lets leave it out - we can always add it later if desired.
2) within <controller type='pci'>, a new <model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an <address> element (neither will pcie-root-bus).
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller.
Or use index numbers modulo 256 to identify domain numbers. One note on q35 - we need to make sure whatever we do in terms of creating default <controller>s in the XML 'just works' for applications. eg if they define a guest using <type machine="q35">hvm</type>, and then add an <interface>, it should do the right thing wrt PCI addressing/connectivity. We must not require applications to manually add <controller> elements for q35 for things to work. Adding <controller>s must purely be an opt-in for apps which have the detailed knowledge required & need full control over bus layout. Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 04/15/2013 06:29 AM, Daniel P. Berrange wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote:
> So if we later allowed for mutiple PCI roots, then we'd have something > like > > <controller type="pci-root" index="0"> > <model name="i440FX"/> > </controller> > <controller type="pci-root" index="1"> > <model name="i440FX"/> > </controller> > <controller type="pci" index="0"> <!-- Host bridge 1 --> > <address type='pci' domain='0' bus='0' slot='0''/> > </controller> > <controller type="pci" index="0"> <!-- Host bridge 2 --> > <address type='pci' domain='1' bus='0' slot='0''/> > </controller>
On Fri, Apr 12, 2013 at 11:46:15AM -0400, Laine Stump wrote:
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>?
IIUC there is a limit on the number of PCI buses you can create per domain, due to fixed size of PCI addresses. Google suggests to me the limit is 256. So for domain 1, we could just start index at 256, and domain 2 at 512, etc
Okay. Whether we choose that method, or a separate domain attribute, I'm satisfied that we'll be able to find a way to solve it when the time comes (and it hasn't yet), so we can ignore that problem for now.
Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:
1) eliminate the separate pci-root controller type Ok, lets leave it out - we can always add it later if desired.
Okay.
2) within <controller type='pci'>, a new <model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an <address> element (neither will pcie-root-bus).
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller. Or use index numbers modulo 256 to identify domain numbers.
Right. One or the other. But we can defer that discussion.
One note on q35 - we need to make sure whatever we do in terms of creating default <controller>s in the XML 'just works' for applications. eg if they define a guest using <type machine="q35">hvm</type>, and then add a <interface>, it should do the right thing wrt PCI addressing/connectivity. We must not require applications to manually add <controller> elements for q35 for things to work. Adding <controller>s must purely be an opt-in for apps which have the detailed knowledge rquired & need full control over bus layout.
Yep. What I see happening is that the place where we currently add default controllers will, in the future, automatically add this for machinetype pc* and rhel-*:

  <controller type='pci'>   <!-- implied index='0' -->
    <model type='pci-root'/>
  </controller>

and for machinetype q35* it will add (something like):

  <controller type='pci'>   <!-- index='0' -->
    <model type='pcie-root'/>
  </controller>
  <controller type='pci'>   <!-- index='1' -->
    <model type='dmi-to-pci-bridge'/>
    <address type='pci' bus='0' slot='0x1e'/>
  </controller>
  <controller type='pci'>   <!-- index='2' -->
    <model type='pci-bridge'/>
    <address type='pci' bus='1' slot='1'/>
  </controller>

The slot-auto-reserve code will look through all pci controllers and only auto-reserve slots on controllers appropriate for the given device - controller 0 is already inappropriate for PCI devices, and we can mark the dmi-to-pci-bridge type as being inappropriate for auto-reserve (since, if I recall correctly, I was told that you can't hotplug devices on that bus). So, all new PCI devices in the config will get addresses with bus='2'.

Of course this means that it will not be possible to switch an existing domain config from pc to q35 simply by changing the machinetype - the bus number in the address of all devices will need to be changed from 0 to 2. But this is another case of "opt in", and already requires editing the domain config anyway. If someone creates a brand new q35 machine though, all PCI devices will get added with bus='whatever is the bus number of the first pci-root or pci-bridge controller' (in this case, '2').

So, here are the proposed pci controller types cleaned up and re-summarized, followed by an example.

<controller type='pci'>
=======================
This will be used for *all* of the following PCI controller devices supported by qemu:

<model type='pci-root'/> (implicit/integrated)
------------------------
Upstream: implicit connection to the host
Downstream: 32 slots (slot 0 reserved), PCI devices only
qemu commandline: nothing (implicit in the pc-* etc. machinetypes)

This controller represents a pc* (or rhel-*) machine's integrated PCI bus (pci.0) and provides places for PCI devices to connect (including the "pci-bridge" type of PCI controller). There is only one of these controllers, and it will *always* be index='0', and will have no <address> element.

<model type='pcie-root'/> (implicit/integrated)
-------------------------
Upstream: implicit connection to the host
Downstream: 32 slots (slot 0 reserved), PCIe devices only, no hotplug.
qemu commandline: nothing (implicit in the q35-* machinetype)

This controller represents a q35's PCI "root complex", and provides places for PCIe devices to connect. Examples are root-ports, dmi-to-pci-bridges, sata controllers, integrated sound/usb/ethernet devices (do any of those integrated devices that can be connected to the pcie-root-bus exist yet?). There is only one of these controllers, and it will *always* be index='0', and will have no <address> element.

<model type='root-port'/> (ioh3420)
-------------------------
Upstream: PCIe, connect to pcie-root-bus *only* (?)
Downstream: 1 slot, PCIe devices only (?)
qemu commandline: -device ioh3420,...

These can only connect to the "pcie-root" of a q35. Any PCIe devices can connect to it, including an upstream-switch-port.

<model type='upstream-switch-port'/> (x3130-upstream)
------------------------------------
Upstream: PCIe, connect to pcie-root-bus, root-port, or downstream-switch-port (?)
Downstream: 32 slots, connect *only* to downstream-switch-port
qemu-commandline: -device x3130-upstream

This is the upper side of a switch that can multiplex multiple devices onto a single port. It's only useful when one or more downstream switch ports are connected to it.

<model type='downstream-switch-port'/> (xio3130-downstream)
--------------------------------------
Upstream: connect *only* to upstream-switch-port
Downstream: 1 slot, any PCIe device
qemu commandline: -device xio3130-downstream

You can connect one or more of these to an upstream-switch-port in order to effectively plug multiple devices into a single PCIe port.

<model type='dmi-to-pci-bridge'/> (i82801b11-bridge)
---------------------------------
(btw, what does "dmi" mean?)
Upstream: pcie-root *only*
Downstream: 32 slots, any PCI device (including "pci-bridge"), no hotplug (?)
qemu commandline: -device i82801b11-bridge,...

This is the gateway to the world of standard old PCI.

<model type='pci-bridge'/> (pci-bridge)
--------------------------
Upstream: PCI, connect to 1) pci-root, 2) dmi-to-pci-bridge, 3) another pci-bridge
Downstream: any PCI device, 32 slots
qemu commandline: -device pci-bridge,...

This differs from dmi-to-pci-bridge in that its upstream connection is PCI rather than PCIe (so it will work on an i440FX system, which has a pci-root rather than pcie-root) and that hotplug is supported. In general, if a guest will have any PCI devices, one of these controllers should be added, and the PCI devices connected to it rather than to the dmi-to-pci-bridge.

************************************
(For q35, we *may* decide to always auto-add a dmi-to-pci-bridge at 00:1E.0, and a pci-bridge on slot 1 of the dmi-to-pci-bridge. This will allow a continuation of the tradition of simply adding new devices to the config without worrying about where they connect.)

============================================================================
Just to make sure this config model will work, here is the XML to replicate the layout (only the ones involved in the PCI tree, along with 3 ethernet devices as examples) of the X58 hardware I have sitting under my desk (I've attached lspci and virsh nodedev-list --tree output from that machine):

  <controller type='pci' index='0'>
    <model type='pcie-root'/>
  </controller>

  <controller type='pci' index='1'>
    <model type='root-port'/>
    <address type='pci' bus='0' slot='1'/>
  </controller>
  (there is a scsi controller connected to bus='1')

  <controller type='pci' index='2'>
    <model type='root-port'/>
    <address type='pci' bus='0' slot='3'/>
  </controller>
  (the VGA controller is connected to bus='2')

  <controller type='pci' index='3'>
    <model type='root-port'/>
    <address type='pci' bus='0' slot='7'/>
  </controller>
  (PCIe SRIOV network card (in external PCIe slot) connected to bus='3')

  <controller type='pci' index='4'>
    <model type='root-port'/>
    <address type='pci' bus='0' slot='0x1c' function='0'/>
  </controller>
  (unused PCIe slot available on bus='4')

  <!-- pcie-root (0:1c.4) -> root-port (5:0.0) -> onboard ethernet -->
  <controller type='pci' index='5'>
    <model type='root-port'/>
    <address type='pci' bus='0' slot='0x1c' function='4'/>
  </controller>
  <interface type='blah'>
    ...
    <mac address='00:27:13:53:db:76'/>
    <address type='pci' bus='5' slot='0' function='0'/>
  </interface>

  <!-- more complicated connection to 2nd systemboard ethernet -->
  <!-- pcie-root ->(0:1c:5)root-port -> (6:0.0)upstream-switch-port ->
       (7:3.0)downstream-switch-port -> (9:0.0)ethernet -->
  <controller type='pci' index='6'>
    <model type='root-port'/>
    <address type='pci' bus='0' slot='0x1c' function='5'/>
  </controller>
  <controller type='pci' index='7'>
    <model type='upstream-switch-port'/>
    <address type='pci' bus='6' slot='0' function='0'/>
  </controller>
  <controller type='pci' index='8'>
    <model type='downstream-switch-port'/>
    <address type='pci' bus='7' slot='2' function='0'/>
  </controller>
  <controller type='pci' index='9'>
    <model type='downstream-switch-port'/>
    <address type='pci' bus='7' slot='3' function='0'/>
  </controller>
  <interface type='blah'>
    ...
    <mac address='00:27:13:53:db:77'/>
    <address type='pci' bus='9' slot='0' function='0'/>
  </interface>

  <!-- old-fashioned PCI ethernet in an external PCI slot -->
  <controller type='pci' index='0x0a'>
    <model type='dmi-to-pci-bridge'/>
    <address type='pci' bus='0x1e' slot='0' function='0'/>
  </controller>
  <interface type='blah'>
    ...
    <mac address='00:03:47:7b:63:e6'/>
    <address type='pci' bus='0x0a' slot='0x0e' function='0'/>
  </interface>

So I think this will all work. Does anyone see any problems?

If not, then we can draw it all back to the *current* patchset - support for multiple PCI buses using the pci-bridge device. For *that*, we only need to implement the following bits of the above:

1) There will be a new <controller type='pci'> device, with a <model type='xyz'/> subelement. Initially we will support types "pci-root" and "pci-bridge" (all the other types discussed above can be added later). pci-root will have *no <address>* element (and will generate nothing on the qemu commandline, but will create a 32 slot "bus='0'" to plug PCI devices into). pci-bridge will have an <address> element, will generate a -device option on the qemu commandline, and will also create a 32 slot "bus='n'" to plug PCI devices into.

2) for machinetypes that have a PCI bus, the config should have this controller auto-added:

  <controller type='pci'>
    <model type='pci-root'/>
  </controller>

This will make bus='0' available (but add nothing to the qemu commandline). Any attempt to add a PCI device when there is no bus available should be an error.

3) The way to add more buses will be to add a controller like this:

  <controller type='pci'>
    <model type='pci-bridge'/>
  </controller>

4) When <controller type='usb'> was added, resulting in auto-generated devices, that caused problems when migrating from a host with newer libvirt to one with older libvirt. We need to make sure we don't suffer the same problem this time. See the following two BZes for details (unless you have a better memory than me! :-):

https://bugzilla.redhat.com/show_bug.cgi?id=815503
https://bugzilla.redhat.com/show_bug.cgi?id=856864

(and note how danpb eerily prophesied the current pending situation :-)

I think everything else about Jan's/Liguang's pci-bridge patches can remain.
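(To make the immediate scope of points 1-3 concrete, a minimal sketch of the XML for a pc/i440FX guest and the kind of command line it should produce - the chassis_nr, slots, and the rtl8139 are illustrative guesses, not output of the actual patches:

  <controller type='pci' index='0'>
    <model type='pci-root'/>
  </controller>
  <controller type='pci' index='1'>
    <model type='pci-bridge'/>
    <address type='pci' bus='0' slot='4'/>
  </controller>
  <interface type='network'>
    ...
    <address type='pci' bus='1' slot='2'/>
  </interface>

  -device pci-bridge,chassis_nr=1,id=pci.1,bus=pci.0,addr=0x4 \
  -device rtl8139,bus=pci.1,addr=0x2

The pci-root controller itself emits nothing on the command line; only the pci-bridge and the devices behind it do.)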

On 04/15/2013 04:09 PM, Laine Stump wrote:
On 04/15/2013 06:29 AM, Daniel P. Berrange wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote:
>> So if we later allowed for mutiple PCI roots, then we'd have something >> like >> >> <controller type="pci-root" index="0"> >> <model name="i440FX"/> >> </controller> >> <controller type="pci-root" index="1"> >> <model name="i440FX"/> >> </controller> >> <controller type="pci" index="0"> <!-- Host bridge 1 --> >> <address type='pci' domain='0' bus='0' slot='0''/> >> </controller> >> <controller type="pci" index="0"> <!-- Host bridge 2 --> >> <address type='pci' domain='1' bus='0' slot='0''/> >> </controller>
On Fri, Apr 12, 2013 at 11:46:15AM -0400, Laine Stump wrote:
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>?
IIUC there is a limit on the number of PCI buses you can create per domain, due to fixed size of PCI addresses. Google suggests to me the limit is 256. So for domain 1, we could just start index at 256, and domain 2 at 512, etc
Okay. Whether we choose that method, or a separate domain attribute, I'm satisfied that we'll be able to find a way to solve it when the time comes (and it hasn't yet), so we can ignore that problem for now.
*PLEASE* don't create a new/competing naming/numbering scheme for differentiating PCI domains.... as much as I dislike the overuse of the term 'domain', it's what is used. No sane person is going to look to assign PCI bus numbers > 256 in order to get new/different domains. The name sucks, but that's what it's called in the code, and what customers are used to.
Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:
1) eliminate the separate pci-root controller type Ok, lets leave it out - we can always add it later if desired.
Okay.
Not so fast.... something that represents the PCI Root Complex might be handy -- error handling and embedded devices (like IOMMUs, intr-remapping table) come to mind... ACPI tables (if they get duped from real systems) may need unconventional naming schemes for qemu if an RC isn't modelled.
2) within<controller type='pci'>, a new<model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an<address> element (neither will pcie-root-bus).
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller. Or use index numbers modulo 256 to identify domain numbers.
Right. One or the other. But we can defer that discussion.
Just say 'domain' .... again! ;-)
One note on q35 - we need to make sure whatever we do in terms of creating default<controller>s in the XML 'just works' for applications. eg if they define a guest using<type machine="q35">hvm</type>, and then add a <interface>, it should do the right thing wrt PCI addressing/connectivity. We must not require applications to manually add<controller> elements for q35 for things to work. Adding<controller>s must purely be an opt-in for apps which have the detailed knowledge rquired& need full control over bus layout.
Yep. What I see happening is that the place where we currently add default controllers will, in the future, automatically add this for machinetype pc* and rhel-*:
<controller type='pci'> <!-- implied index='0' --> <model type='pci-root'/> </controller>
and for machinetype q35* it will add (something like):
<controller type='pci'> <!-- index='0' --> <model type='pcie-root'/> </controller> <controller type='pci'> <!-- index='1' --> <model type='dmi-to-pci-bridge'/> <address type='pci' bus='0' slot='0x1e'/> </controller> <controller type='pci'> <!-- index='2' --> <model type='pci-bridge'> <address type='pci' bus='1' slot='1'/> </controller>
The slot-auto-reserve code will look through all pci controllers and only auto-reserve slots on controllers appropriate for the given device - controller 0 is already inappropriate for PCI devices, and we can mark the dmi-to-pci-bridge type as being inappropriate for auto-reserve (since, if I recall correctly, I was told that you can't hotplug devices on that bus). So, all new PCI devices in the config will get addresses with bus='2'.
Of course this means that it will not be possible to switch an existing domain config from pc to q35 simply by changing the machinetype - the bus number in the address of all devices will need to be changed from 0 to 2. But this is another case of "opt in", and already requires editing the domain config anyway. If someone creates a brand new q35 machine though, all PCI devices will get added with bus='whatever is the bus number of the first pci-root or pci-bridge controller' (in this case, '2').
So, here are the proposed pci controller types cleaned up an re-summarized, followed by an example.
<controller type='pci'> =======================
This will be used for *all* of the following PCI controller devices supported by qemu:
<model type='pci-root'/> (implicit/integrated) ------------------------
Upstream: implicit connection to the host Downstream: 32 slots (slot 0 reserved), PCI devices only qemu commandline: nothing (implicit in the pc-* etc. machinetypes)
This controller represents a pc* (or rhel-*) machine's integrated PCI bus (pci.0) and provides places for PCI devices to connect (including the "pci-bridge" type of PCI controller).
There is only one of these controllers, and it will *always* be index='0', and will have no<address> element.
ok.
<model type='pcie-root'/> (implicit/integrated) -------------------------
Upstream: implicit connection to the host Downstream: 32 slots (slot 0 reserved), PCIe devices only, no hotplug. qemu commandline: nothing (implicit in the q35-* machinetype)
This controller represents a q35's PCI "root complex", and provides places for PCIe devices to connect. Examples are root-ports, dmi-to-pci-bridges sata controllers, integrated sound/usb/ethernet devices (do any of those integrated devices that can be connected to the pcie-root-bus exist yet?).
There is only one of these controllers, and it will *always* be index='0', and will have no<address> element.
ok.
<model type='root-port'/> (ioh3420) -------------------------
Upstream: PCIe, connect to pcie-root-bus *only* (?) Downstream: 1 slot, PCIe devices only (?) qemu commandline: -device ioh3420,...
These can only connect to the "pcie-root" of a q35. Any PCIe devices can connect to it, including an upstream-switch-port.
ioh on q35; ich9/10/xx for other intel chipsets
<model type='upstream-switch-port'/> (x3130-upstream) ------------------------------------
Upstream: PCIe, connect to pcie-root-bus, root-port, or downstream-switch-port (?) Downstream: 32 slots, connect *only* to downstream-switch-port qemu-commandline: -device x3130-upstream
This is the upper side of a switch that can multiplex multiple devices onto a single port. It's only useful when one or more downstream switch ports are connected to it.
<model type='downstream-switch-port'/> (xio3130-downstream) --------------------------------------
Upstream: connect *only* to upstream-switch-port Downstream: 1 slot, any PCIe device qemu commandline: -device xio3130-downstream
You can connect one or more of these to an upstream-switch-port in order to effectively plug multiple devices into a single PCIe port.
ugh! one cannot have 3130-downstream w/o 3130-upstream; simplify: PCIe-PPB-up; PCIe-PPB-down -- then it can be anything (not TI, not IDT, not Intel, etc.).
<model type='dmi-to-pci-bridge'/> (i82801b11-bridge) ---------------------------------
(btw, what does "dmi" mean?)
Upstream: pcie-root *only* Downstream: 32 slots, any PCI device (including "pci-bridge"), no hotplug (?) qemu commandline: -device i82801b11-bridge,...
This is the gateway to the world of standard old PCI.
why needed?
<model type='pci-bridge'/> (pci-bridge) --------------------------
Upstream: PCI, connect to 1) pci-root, 2) dmi-to-pci-bridge 3) another pci-bridge Downstream: any PCI device, 32 slots qemu commandline: -device pci-bridge,...
This differs from dmi-to-pci-bridge in that its upstream connection is PCI rather than PCIe (so it will work on an i440FX system, which has a pci-root rather than pcie-root) and that hotplug is supported. In general, if a guest will have any PCI devices, one of these controllers should be added, and the PCI devices connected to it rather than to the dmi-to-pci-bridge.
************************************ (For q35, we *may* decide to always auto-add a dmi-to-pci-bridge at 00:1E.0, and a pci-bridge on slot 1 of the dmi-to-pci-bridge. This will allow a continuation of the tradition of simply adding new devices to the config without worrying about where they connect.)
============================================================================ Just to make sure this config model will work, here is the XML to replicate the layout (only the ones involved in the PCI tree, along with 3 ethernet devices as examples) of the X58 hardware I have sitting under my desk (I've attached lspci and virsh nodedev-list --tree output from that machine):
<controller type='pci' index='0'> <model type='pcie-root'/> </controller>
<controller type='pci' index='1'> <model type='root-port'/> <address type='pci' bus='0' slot='1'/> </controller>
( there is a scsi controller connected to bus='1')
<controller type='pci' index='2'> <model type='root-port'/> <address type='pci' bus='0' slot='3'/> </controller>
(the VGA controller is connected to bus='2')
<controller type='pci' index='3'> <model type='root-port'/> <address type='pci' bus='0' slot='7'/> </controller>
(PCIe SRIOV network card (in external PCIe slot) connected to bus='3')
<controller type='pci' index='4'> <model type='root-port'/> <address type='pci' bus='0' slot='0x1c' function='0'/> </controller>
(unused PCIe slot available on bus='4')
<!-- pcie-root (0:1c.4) -> root-port (5:0.0) -> onboard ethernet -->
<controller type='pci' index='5'>
  <model type='root-port'/>
  <address type='pci' bus='0' slot='0x1c' function='4'/>
</controller>
<interface type='blah'>
  ...
  <mac address='00:27:13:53:db:76'/>
  <address type='pci' bus='5' slot='0' function='0'/>
</interface>

<!-- more complicated connection to 2nd systemboard ethernet -->
<!-- pcie-root -> (0:1c:5) root-port -> (6:0.0) upstream-switch-port -> (7:3.0) downstream-switch-port -> (9:0.0) ethernet -->
<controller type='pci' index='6'>
  <model type='root-port'/>
  <address type='pci' bus='0' slot='0x1c' function='5'/>
</controller>
<controller type='pci' index='7'>
  <model type='upstream-switch-port'/>
  <address type='pci' bus='6' slot='0' function='0'/>
</controller>
<controller type='pci' index='8'>
  <model type='downstream-switch-port'/>
  <address type='pci' bus='7' slot='2' function='0'/>
</controller>
<controller type='pci' index='9'>
  <model type='downstream-switch-port'/>
  <address type='pci' bus='7' slot='3' function='0'/>
</controller>
<interface type='blah'>
  ...
  <mac address='00:27:13:53:db:77'/>
  <address type='pci' bus='9' slot='0' function='0'/>
</interface>

<!-- old-fashioned PCI ethernet in an external PCI slot -->
<controller type='pci' index='0x0a'>
  <model type='dmi-to-pci-bridge'/>
  <address type='pci' bus='0x1e' slot='0' function='0'/>
</controller>
<interface type='blah'>
  ...
  <mac address='00:03:47:7b:63:e6'/>
  <address type='pci' bus='0x0a' slot='0x0e' function='0'/>
</interface>
So I think this will all work. Does anyone see any problems?
If not, then we can bring it all back to the *current* patchset - support for multiple PCI buses using the pci-bridge device. For *that*, we only need to implement the following bits of the above:
1) There will be a new <controller type='pci'> device, with a <model type='xyz'/> subelement. Initially we will support types "pci-root" and "pci-bridge" (all the other types discussed above can be added later). pci-root will have *no <address>* element (and will generate nothing on the qemu commandline, but will create a 32 slot "bus='0'" to plug PCI devices into). pci-bridge will have an <address> element, will generate a -device option on the qemu commandline, and will also create a 32 slot "bus='n'" to plug PCI devices into.
2) for machinetypes that have a PCI bus, the config should have this controller auto-added:
<controller type='pci'> <model type='pci-root'/> </controller>
This will make bus='0' available (but add nothing to the qemu commandline). Any attempt to add a PCI device when there is no bus available should be an error.
3) The way to add more buses will be to add a controller like this:
<controller type='pci'> <model type='pci-bridge'/> </controller>
for legacy PCI, yes; but for PCIe, one needs PCIe-PPB-up & at least one PCI-PPB-down. One _cannot_ have just a single pci-bridge except as the driving bridge from a root-complex port.
4) When <controller type='usb'> was added, resulting in auto-generated devices, that caused problems when migrating from a host with newer libvirt to one with older libvirt. We need to make sure we don't suffer the same problem this time. See the following two BZes for details (unless you have a better memory than me! :-):
https://bugzilla.redhat.com/show_bug.cgi?id=815503 https://bugzilla.redhat.com/show_bug.cgi?id=856864
(and note how danpb eerily prophesied the current pending situation :-)
I think everything else about Jan's/Liguang's pci-bridge patches can remain.

On 04/15/2013 06:14 PM, Don Dutile wrote:
On 04/15/2013 04:09 PM, Laine Stump wrote:
On 04/15/2013 06:29 AM, Daniel P. Berrange wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote: >>> So if we later allowed for mutiple PCI roots, then we'd have >>> something >>> like >>> >>> <controller type="pci-root" index="0"> >>> <model name="i440FX"/> >>> </controller> >>> <controller type="pci-root" index="1"> >>> <model name="i440FX"/> >>> </controller> >>> <controller type="pci" index="0"> <!-- Host bridge 1 --> >>> <address type='pci' domain='0' bus='0' slot='0''/> >>> </controller> >>> <controller type="pci" index="0"> <!-- Host bridge 2 --> >>> <address type='pci' domain='1' bus='0' slot='0''/> >>> </controller>
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>? IIUC there is a limit on the number of PCI buses you can create per domain, due to fixed size of PCI addresses. Google suggests to me
On Fri, Apr 12, 2013 at 11:46:15AM -0400, Laine Stump wrote: the limit is 256. So for domain 1, we could just start index at 256, and domain 2 at 512, etc
Okay. Whether we choose that method, or a separate domain attribute, I'm satisfied that we'll be able to find a way to solve it when the time comes (and it hasn't yet), so we can ignore that problem for now.
*PLEASE* don't create a new/competing naming/numbering scheme for differentiating PCI domains.... as much as I dislike the overuse of the term 'domain', it's what is used. No sane person is going to look to assign PCI bus numbers > 256 in order to get new/different domains. The name sucks, but that's what it's called in the code, and what customers are used to.
I infer from this that you're okay with:

  <controller type='pci' domain='n' index='n'>

when defining a new controller (using "index" instead of "bus" is a bit bothersome, but it is following current convention; should we reconsider and just call it "bus" this time?), and:

  <address domain='n' bus='n' slot='n' function='n'/>

for specifying where to connect a PCI/PCIe device (including PCI[e] controllers).
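(As an illustrative sketch only - following the attribute names proposed above, with made-up index/bus/slot numbers - a bridge plus a NIC plugged into it might look like:)

    <!-- a second PCI bus provided by a bridge, plus a NIC plugged into it -->
    <controller type='pci' domain='0' index='1'>
      <model type='pci-bridge'/>
      <!-- the bridge itself occupies slot 4 of bus 0 -->
      <address type='pci' domain='0' bus='0' slot='4' function='0'/>
    </controller>
    <interface type='network'>
      ...
      <!-- the NIC lands on the new bus created by the index='1' controller -->
      <address type='pci' domain='0' bus='1' slot='3' function='0'/>
    </interface>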
Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:
1) eliminate the separate pci-root controller type
Ok, let's leave it out - we can always add it later if desired.
Okay.
Not so fast.... something that represents the PCI Root Complex might be handy -- error handling and embedded devices (like IOMMUs, intr-remapping table) come to mind... ACPI tables (if they get duped from real systems) may need unconventional naming schemes for qemu if an RC isn't modelled.
If I understand it correctly, that's what I intend <model type='pcie-root'/> to be - it is what shows up as "bus 0" in the output of lspci. Everything, including embedded/integrated devices, is shown as connected to domain 0 bus 0, isn't it? What Dan had originally suggested was a separate "root" that only specified "i440FX" or "q35", and that the devices I'm naming "pci-root" and "pcie-root" in this latest draft would just magically connect to that "root" by giving their own connection address as "slot='0'" of their own bus. That seemed redundant, so I suggested removing it and just having a special kind of PCI controller that had no connection address.
2) within <controller type='pci'>, a new <model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an <address> element (neither will pcie-root-bus).
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller. Or use index numbers modulo 256 to identify domain numbers.
Right. One or the other. But we can defer that discussion.
Just say 'domain' .... again! ;-)
Okay, I'm on board with you (and we have quite awhile to convince anyone who isn't :-)
One note on q35 - we need to make sure whatever we do in terms of creating default <controller>s in the XML 'just works' for applications. E.g. if they define a guest using <type machine="q35">hvm</type>, and then add an <interface>, it should do the right thing wrt PCI addressing/connectivity. We must not require applications to manually add <controller> elements for q35 for things to work. Adding <controller>s must purely be an opt-in for apps which have the detailed knowledge required & need full control over bus layout.
Yep. What I see happening is that the place where we currently add default controllers will, in the future, automatically add this for machinetype pc* and rhel-*:
<controller type='pci'> <!-- implied index='0' --> <model type='pci-root'/> </controller>
and for machinetype q35* it will add (something like):
<controller type='pci'> <!-- index='0' -->
  <model type='pcie-root'/>
</controller>
<controller type='pci'> <!-- index='1' -->
  <model type='dmi-to-pci-bridge'/>
  <address type='pci' bus='0' slot='0x1e'/>
</controller>
<controller type='pci'> <!-- index='2' -->
  <model type='pci-bridge'/>
  <address type='pci' bus='1' slot='1'/>
</controller>
The slot-auto-reserve code will look through all pci controllers and only auto-reserve slots on controllers appropriate for the given device - controller 0 is already inappropriate for PCI devices, and we can mark the dmi-to-pci-bridge type as being inappropriate for auto-reserve (since, if I recall correctly, I was told that you can't hotplug devices on that bus). So, all new PCI devices in the config will get addresses with bus='2'.
Of course this means that it will not be possible to switch an existing domain config from pc to q35 simply by changing the machinetype - the bus number in the address of all devices will need to be changed from 0 to 2. But this is another case of "opt in", and already requires editing the domain config anyway. If someone creates a brand new q35 machine though, all PCI devices will get added with bus='whatever is the bus number of the first pci-root or pci-bridge controller' (in this case, '2').
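(A hedged sketch of that renumbering, assuming the auto-added q35 controllers shown above; the slot number is arbitrary:)

    <!-- on a pc machinetype the device sits on the integrated bus -->
    <address type='pci' domain='0' bus='0' slot='5' function='0'/>

    <!-- after switching to q35 it must instead point at the auto-added
         pci-bridge (index='2' above) -->
    <address type='pci' domain='0' bus='2' slot='5' function='0'/>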
So, here are the proposed pci controller types cleaned up and re-summarized, followed by an example.
<controller type='pci'> =======================
This will be used for *all* of the following PCI controller devices supported by qemu:
<model type='pci-root'/> (implicit/integrated) ------------------------
Upstream: implicit connection to the host
Downstream: 32 slots (slot 0 reserved), PCI devices only
qemu commandline: nothing (implicit in the pc-* etc. machinetypes)
This controller represents a pc* (or rhel-*) machine's integrated PCI bus (pci.0) and provides places for PCI devices to connect (including the "pci-bridge" type of PCI controller).
There is only one of these controllers, and it will *always* be index='0', and will have no <address> element.
ok.
<model type='pcie-root'/> (implicit/integrated) -------------------------
Upstream: implicit connection to the host
Downstream: 32 slots (slot 0 reserved), PCIe devices only, no hotplug.
qemu commandline: nothing (implicit in the q35-* machinetype)
This controller represents a q35's PCI "root complex", and provides places for PCIe devices to connect. Examples are root-ports, dmi-to-pci-bridges, sata controllers, integrated sound/usb/ethernet devices (do any of those integrated devices that can be connected to the pcie-root-bus exist yet?).
There is only one of these controllers, and it will *always* be index='0', and will have no <address> element.
ok.
<model type='root-port'/> (ioh3420) -------------------------
Upstream: PCIe, connect to pcie-root-bus *only* (?)
Downstream: 1 slot, PCIe devices only (?)
qemu commandline: -device ioh3420,...
These can only connect to the "pcie-root" of a q35. Any PCIe devices can connect to it, including an upstream-switch-port.
ioh on q35; ich9/10/xx for other intel chipsets
<model type='upstream-switch-port'/> (x3130-upstream) ------------------------------------
Upstream: PCIe, connect to pcie-root-bus, root-port, or downstream-switch-port (?)
Downstream: 32 slots, connect *only* to downstream-switch-port
qemu-commandline: -device x3130-upstream
This is the upper side of a switch that can multiplex multiple devices onto a single port. It's only useful when one or more downstream switch ports are connected to it.
<model type='downstream-switch-port'/> (xio3130-downstream) --------------------------------------
Upstream: connect *only* to upstream-switch-port
Downstream: 1 slot, any PCIe device
qemu commandline: -device xio3130-downstream
You can connect one or more of these to an upstream-switch-port in order to effectively plug multiple devices into a single PCIe port.
ugh! one cannot have 3130-downstream w/o 3130upstream; simplify: PCIe-PPB-up; PCIe-PPB-down -- then it can be anything (not TI, not IDT, not Intel, etc.).
Are you just saying that the upstream and downstream ports must be a matching set? That was the intent with naming them (up|down)stream-switch-port - since I've been led to believe that all the different chipsets end up providing the same externally-visible functionality (as long as all the building blocks are used from the same set), I've just used functional names rather than specific chipset/device names, so that the appropriate choice can be made at domain start time, based on what's available in the qemu being run (or the machine type that is chosen).

(the "qemu commandline" line merely tells what would be used to implement this controller *with existing qemu 1.4 devices*)

(Hmmm - do you think that changing something like the type of upstream and downstream switch ports would lead to Windows requiring a license re-activation? If so, we may need to rethink this and hardcode the specific device that's used :-/)
<model type='dmi-to-pci-bridge'/> (i82801b11-bridge) ---------------------------------
Based on Alex's feedback, do we maybe want to name this device "pcie-to-pci-bridge" instead? (or maybe just "pcie-to-pci"?)
(btw, what does "dmi" mean?)
Upstream: pcie-root *only*
Downstream: 32 slots, any PCI device (including "pci-bridge"), no hotplug (?)
qemu commandline: -device i82801b11-bridge,...
This is the gateway to the world of standard old PCI.
why needed?
My understanding is that this is the only type of bridge that can be directly connected to pcie-root (the "root complex") and provide plain old PCI slots.
<model type='pci-bridge'/> (pci-bridge) --------------------------
For consistency with the above "pcie-to-pci-bridge" type, should we name this "pci-to-pci-bridge" instead? (or maybe just "pci-to-pci"?)
Upstream: PCI, connect to 1) pci-root, 2) dmi-to-pci-bridge, 3) another pci-bridge
Downstream: any PCI device, 32 slots
qemu commandline: -device pci-bridge,...
This differs from dmi-to-pci-bridge in that its upstream connection is PCI rather than PCIe (so it will work on an i440FX system, which has a pci-root rather than pcie-root) and that hotplug is supported. In general, if a guest will have any PCI devices, one of these controllers should be added, and the PCI devices connected to it rather than to the dmi-to-pci-bridge.
************************************ (For q35, we *may* decide to always auto-add a dmi-to-pci-bridge at 00:1E.0, and a pci-bridge on slot 1 of the dmi-to-pci-bridge. This will allow a continuation of the tradition of simply adding new devices to the config without worrying about where they connect.)
============================================================================
Just to make sure this config model will work, here is the XML to replicate the layout (only the ones involved in the PCI tree, along with 3 ethernet devices as examples) of the X58 hardware I have sitting under my desk (I've attached lspci and virsh nodedev-list --tree output from that machine):
<controller type='pci' index='0'> <model type='pcie-root'/> </controller>
<controller type='pci' index='1'> <model type='root-port'/> <address type='pci' bus='0' slot='1'/> </controller>
( there is a scsi controller connected to bus='1')
<controller type='pci' index='2'> <model type='root-port'/> <address type='pci' bus='0' slot='3'/> </controller>
(the VGA controller is connected to bus='2')
<controller type='pci' index='3'> <model type='root-port'/> <address type='pci' bus='0' slot='7'/> </controller>
(PCIe SRIOV network card (in external PCIe slot) connected to bus='3')
<controller type='pci' index='4'> <model type='root-port'/> <address type='pci' bus='0' slot='0x1c' function='0'/> </controller>
(unused PCIe slot available on bus='4')
<!-- pcie-root (0:1c.4) -> root-port (5:0.0) -> onboard ethernet -->
<controller type='pci' index='5'>
  <model type='root-port'/>
  <address type='pci' bus='0' slot='0x1c' function='4'/>
</controller>
<interface type='blah'>
  ...
  <mac address='00:27:13:53:db:76'/>
  <address type='pci' bus='5' slot='0' function='0'/>
</interface>

<!-- more complicated connection to 2nd systemboard ethernet -->
<!-- pcie-root -> (0:1c:5) root-port -> (6:0.0) upstream-switch-port -> (7:3.0) downstream-switch-port -> (9:0.0) ethernet -->
<controller type='pci' index='6'>
  <model type='root-port'/>
  <address type='pci' bus='0' slot='0x1c' function='5'/>
</controller>
<controller type='pci' index='7'>
  <model type='upstream-switch-port'/>
  <address type='pci' bus='6' slot='0' function='0'/>
</controller>
<controller type='pci' index='8'>
  <model type='downstream-switch-port'/>
  <address type='pci' bus='7' slot='2' function='0'/>
</controller>
<controller type='pci' index='9'>
  <model type='downstream-switch-port'/>
  <address type='pci' bus='7' slot='3' function='0'/>
</controller>
<interface type='blah'>
  ...
  <mac address='00:27:13:53:db:77'/>
  <address type='pci' bus='9' slot='0' function='0'/>
</interface>

<!-- old-fashioned PCI ethernet in an external PCI slot -->
<controller type='pci' index='0x0a'>
  <model type='dmi-to-pci-bridge'/>
  <address type='pci' bus='0x1e' slot='0' function='0'/>
</controller>
<interface type='blah'>
  ...
  <mac address='00:03:47:7b:63:e6'/>
  <address type='pci' bus='0x0a' slot='0x0e' function='0'/>
</interface>
So I think this will all work. Does anyone see any problems?
If not, then we can bring it all back to the *current* patchset - support for multiple PCI buses using the pci-bridge device. For *that*, we only need to implement the following bits of the above:
1) There will be a new <controller type='pci'> device, with a <model type='xyz'/> subelement. Initially we will support types "pci-root" and "pci-bridge" (all the other types discussed above can be added later). pci-root will have *no <address>* element (and will generate nothing on the qemu commandline, but will create a 32 slot "bus='0'" to plug PCI devices into). pci-bridge will have an <address> element, will generate a -device option on the qemu commandline, and will also create a 32 slot "bus='n'" to plug PCI devices into.
2) for machinetypes that have a PCI bus, the config should have this controller auto-added:
<controller type='pci'> <model type='pci-root'/> </controller>
This will make bus='0' available (but add nothing to the qemu commandline). Any attempt to add a PCI device when there is no bus available should be an error.
3) The way to add more buses will be to add a controller like this:
<controller type='pci'> <model type='pci-bridge'/> </controller>
for legacy PCI, yes;
Right. When I said "current patchset", that's what I was referring to. I started this whole design discussion as a sub-thread of a patcheset that just adds support for the "pci-bridge" device to the existing pc-*/rhel-* machinetypes. I only got into the discussion of all the PCIe controller types because I wanted to make sure that the new controller type we added to support that would be logically expandable to support all of the controllers needed for q35/PCIe support (which is in the queue, but will happen later).
but for PCIe, one needs PCIe-PPB-up & at least one PCI-PPB-down. One _cannot_ have just a single pci-bridge except as the driving bridge from a root-complex port.
I had thought that the possibilities for a bridge that provides a legacy PCI bus were:

1) create an i82801b11-bridge device connected to the root complex (or to any root-port or downstream-switch-port). This new bus will provide 32 legacy PCI slots (slot 0 reserved), but devices (currently) can't be hot-plugged.

2) create a pci-bridge device connected to any existing legacy PCI slot (e.g. one of the slots of an i82801b11-bridge). This new bus will provide 32 legacy PCI slots (slot 0 reserved) and hot-plug *is* supported.

The (up|down)stream-switch-port based "buses" provide PCIe slots, not PCI.
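(A rough sketch of 1) and 2) chained together, using the controller models proposed earlier; the index and slot values here are made up:)

    <!-- 1) i82801b11-bridge hanging off the root complex: PCI slots, no hotplug -->
    <controller type='pci' index='1'>
      <model type='dmi-to-pci-bridge'/>
      <address type='pci' bus='0' slot='0x1e' function='0'/>
    </controller>
    <!-- 2) pci-bridge plugged into one of its slots: PCI slots, hotplug works -->
    <controller type='pci' index='2'>
      <model type='pci-bridge'/>
      <address type='pci' bus='1' slot='1' function='0'/>
    </controller>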
4) When <controller type='usb'> was added, resulting in auto-generated devices, that caused problems when migrating from a host with newer libvirt to one with older libvirt. We need to make sure we don't suffer the same problem this time. See the following two BZes for details (unless you have a better memory than me! :-):
https://bugzilla.redhat.com/show_bug.cgi?id=815503 https://bugzilla.redhat.com/show_bug.cgi?id=856864
(and note how danpb eerily prophesied the current pending situation :-)
I think everything else about Jan's/Liguang's pci-bridge patches can remain.

On Tue, Apr 16, 2013 at 12:35:29PM -0400, Laine Stump wrote:
On 04/15/2013 06:14 PM, Don Dutile wrote:
On 04/15/2013 04:09 PM, Laine Stump wrote:
On 04/15/2013 06:29 AM, Daniel P. Berrange wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote: > On 04/10/2013 05:26 AM, Daniel P. Berrange wrote: >>>> So if we later allowed for mutiple PCI roots, then we'd have >>>> something >>>> like >>>> >>>> <controller type="pci-root" index="0"> >>>> <model name="i440FX"/> >>>> </controller> >>>> <controller type="pci-root" index="1"> >>>> <model name="i440FX"/> >>>> </controller> >>>> <controller type="pci" index="0"> <!-- Host bridge 1 --> >>>> <address type='pci' domain='0' bus='0' slot='0''/> >>>> </controller> >>>> <controller type="pci" index="0"> <!-- Host bridge 2 --> >>>> <address type='pci' domain='1' bus='0' slot='0''/> >>>> </controller>
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>? IIUC there is a limit on the number of PCI buses you can create per domain, due to fixed size of PCI addresses. Google suggests to me
On Fri, Apr 12, 2013 at 11:46:15AM -0400, Laine Stump wrote: the limit is 256. So for domain 1, we could just start index at 256, and domain 2 at 512, etc
Okay. Whether we choose that method, or a separate domain attribute, I'm satisfied that we'll be able to find a way to solve it when the time comes (and it hasn't yet), so we can ignore that problem for now.
*PLEASE* don't create a new/competing naming/numbering scheme for differentiating PCI domains.... as much as I dislike the overuse of the term 'domain', it's what is used. No sane person is going to look to assign PCI bus numbers > 256 in order to get new/different domains. The name sucks, but that's what it's called in the code, and what customers are used to.
I infer from this that you're okay with:
<controller type='pci' domain='n' index='n'>
when defining a new controller (using "index" instead of "bus" is a bit bothersome, but it is following current convention; should we reconsider and just call it "bus" this time?), and:
'index' is standard naming across all libvirt <controller> elements and we shouldn't do anything different for PCI. For the same reason I don't want us inventing a new 'domain' attribute here either.

Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On Tue, Apr 16, 2013 at 12:35:29PM -0400, Laine Stump wrote:
On 04/15/2013 06:14 PM, Don Dutile wrote:
On 04/15/2013 04:09 PM, Laine Stump wrote:
On 04/15/2013 06:29 AM, Daniel P. Berrange wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote: > On 04/10/2013 05:26 AM, Daniel P. Berrange wrote: >>>> So if we later allowed for mutiple PCI roots, then we'd have >>>> something >>>> like >>>> >>>> <controller type="pci-root" index="0"> >>>> <model name="i440FX"/> >>>> </controller> >>>> <controller type="pci-root" index="1"> >>>> <model name="i440FX"/> >>>> </controller> >>>> <controller type="pci" index="0"> <!-- Host bridge 1 --> >>>> <address type='pci' domain='0' bus='0' slot='0''/> >>>> </controller> >>>> <controller type="pci" index="0"> <!-- Host bridge 2 --> >>>> <address type='pci' domain='1' bus='0' slot='0''/> >>>> </controller>
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>? IIUC there is a limit on the number of PCI buses you can create per domain, due to fixed size of PCI addresses. Google suggests to me
On Fri, Apr 12, 2013 at 11:46:15AM -0400, Laine Stump wrote: the limit is 256. So for domain 1, we could just start index at 256, and domain 2 at 512, etc
Okay. Whether we choose that method, or a separate domain attribute, I'm satisfied that we'll be able to find a way to solve it when the time comes (and it hasn't yet), so we can ignore that problem for now.
*PLEASE* don't create a new/competing naming/numbering scheme for differentiating PCI domains.... as much as I dislike the overuse of the term 'domain', it's what is used. No sane person is going to look to assign PCI bus numbers > 256 in order to get new/different domains. The name sucks, but that's what it's called in the code, and what customers are used to.
I infer from this that you're okay with:
<controller type='pci' domain='n' index='n'>
when defining a new controller (using "index" instead of "bus" is a bit bothersome, but it is following current convention; should we reconsider and just call it "bus" this time?), and:
<address domain='n' bus='n' slot='n' function='n'/>
For specifying where to connect a PCI/PCIe device (including PCI[e] controllers).
I think I prefer index to bus, since bus would make people think it's the bus number. I'm still confused as to what exactly domain means. If you identify where a device is connected using the index value, why do you need a separate domain?
Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:
1) eliminate the separate pci-root controller type
Ok, let's leave it out - we can always add it later if desired.
Okay.
Not so fast.... something that represents the PCI Root Complex might be handy -- error handling and embedded devices (like IOMMUs, intr-remapping table) come to mind... ACPI tables (if they get duped from real systems) may need unconventional naming schemes for qemu if an RC isn't modelled.
If I understand it correctly, that's what I intend <model type='pcie-root'/> to be - it is what shows up as "bus 0" in the output of lspci. Everything, including embedded/integrated devices, is shown as connected to domain 0 bus 0, isn't it?
There does not have to be a bus 0 in the output of lspci. Current guests enumerate bus numbers from 0, so there currently usually is one, but this might not hold in the future. I am not sure whether libvirt uses any magic values itself, but I think it's preferable not to have magic numbers. In fact, we only use numbers for the index field for compatibility.
What Dan had originally suggested was a separate "root" that only specified "i440FX" or "q35", and that the devices I'm naming "pci-root" and "pcie-root" in this latest draft would just magically connect to that "root" by giving their own connection address as "slot='0'" of their own bus. That seemed redundant, so I suggested removing it and just having a special kind of PCI controller that had no connection address.
Maybe for the root it makes sense to optionally specify domain instead of index in the address.
2) within <controller type='pci'>, a new <model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an <address> element (neither will pcie-root-bus).
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller. Or use index numbers modulo 256 to identify domain numbers.
Right. One or the other. But we can defer that discussion.
Just say 'domain' .... again! ;-)
Okay, I'm on board with you (and we have quite awhile to convince anyone who isn't :-)
Looks sane. So there shouldn't be any need for the domain attribute anywhere as long as you only have one root, right?
One note on q35 - we need to make sure whatever we do in terms of creating default <controller>s in the XML 'just works' for applications. E.g. if they define a guest using <type machine="q35">hvm</type>, and then add an <interface>, it should do the right thing wrt PCI addressing/connectivity. We must not require applications to manually add <controller> elements for q35 for things to work. Adding <controller>s must purely be an opt-in for apps which have the detailed knowledge required & need full control over bus layout.
Yep. What I see happening is that the place where we currently add default controllers will, in the future, automatically add this for machinetype pc* and rhel-*:
<controller type='pci'> <!-- implied index='0' --> <model type='pci-root'/> </controller>
and for machinetype q35* it will add (something like):
<controller type='pci'> <!-- index='0' -->
  <model type='pcie-root'/>
</controller>
<controller type='pci'> <!-- index='1' -->
  <model type='dmi-to-pci-bridge'/>
  <address type='pci' bus='0' slot='0x1e'/>
</controller>
<controller type='pci'> <!-- index='2' -->
  <model type='pci-bridge'/>
  <address type='pci' bus='1' slot='1'/>
</controller>
The slot-auto-reserve code will look through all pci controllers and only auto-reserve slots on controllers appropriate for the given device - controller 0 is already inappropriate for PCI devices, and we can mark the dmi-to-pci-bridge type as being inappropriate for auto-reserve (since, if I recall correctly, I was told that you can't hotplug devices on that bus). So, all new PCI devices in the config will get addresses with bus='2'.
Of course this means that it will not be possible to switch an existing domain config from pc to q35 simply by changing the machinetype - the bus number in the address of all devices will need to be changed from 0 to 2. But this is another case of "opt in", and already requires editing the domain config anyway. If someone creates a brand new q35 machine though, all PCI devices will get added with bus='whatever is the bus number of the first pci-root or pci-bridge controller' (in this case, '2').
So, here are the proposed pci controller types cleaned up and re-summarized, followed by an example.
<controller type='pci'> =======================
This will be used for *all* of the following PCI controller devices supported by qemu:
<model type='pci-root'/> (implicit/integrated) ------------------------
Upstream: implicit connection to the host
Downstream: 32 slots (slot 0 reserved), PCI devices only
qemu commandline: nothing (implicit in the pc-* etc. machinetypes)
This controller represents a pc* (or rhel-*) machine's integrated PCI bus (pci.0) and provides places for PCI devices to connect (including the "pci-bridge" type of PCI controller).
There is only one of these controllers, and it will *always* be index='0', and will have no <address> element.
ok.
<model type='pcie-root'/> (implicit/integrated) -------------------------
Upstream: implicit connection to the host
Downstream: 32 slots (slot 0 reserved), PCIe devices only, no hotplug.
qemu commandline: nothing (implicit in the q35-* machinetype)
This controller represents a q35's PCI "root complex", and provides places for PCIe devices to connect. Examples are root-ports, dmi-to-pci-bridges, sata controllers, integrated sound/usb/ethernet devices (do any of those integrated devices that can be connected to the pcie-root-bus exist yet?).
There is only one of these controllers, and it will *always* be index='0', and will have no <address> element.
ok.
<model type='root-port'/> (ioh3420) -------------------------
Upstream: PCIe, connect to pcie-root-bus *only* (?)
Downstream: 1 slot, PCIe devices only (?)
qemu commandline: -device ioh3420,...
These can only connect to the "pcie-root" of a q35. Any PCIe devices can connect to it, including an upstream-switch-port.
ioh on q35; ich9/10/xx for other intel chipsets
<model type='upstream-switch-port'/> (x3130-upstream) ------------------------------------
Upstream: PCIe, connect to pcie-root-bus, root-port, or downstream-switch-port (?)
Downstream: 32 slots, connect *only* to downstream-switch-port
qemu-commandline: -device x3130-upstream
This is the upper side of a switch that can multiplex multiple devices onto a single port. It's only useful when one or more downstream switch ports are connected to it.
<model type='downstream-switch-port'/> (xio3130-downstream) --------------------------------------
Upstream: connect *only* to upstream-switch-port
Downstream: 1 slot, any PCIe device
qemu commandline: -device xio3130-downstream
You can connect one or more of these to an upstream-switch-port in order to effectively plug multiple devices into a single PCIe port.
ugh! one cannot have 3130-downstream w/o 3130upstream; simplify: PCIe-PPB-up; PCIe-PPB-down -- then it can be anything (not TI, not IDT, not Intel, etc.).
Are you just saying that the upstream and downstream ports must be a matching set? That was the intent with naming them (up|down)stream-switch-port - since I've been led to believe that all the different chipsets end up providing the same externally-visible functionality (as long as all the building blocks are used from the same set), I've just used functional names rather than specific chipset/device names, so that the appropriate choice can be made at domain start time, based on what's available in the qemu being run (or the machine type that is chosen).
(the "qemu commandline" line merely tells what would be used to implement this controller *with existing qemu 1.4 devices*)
(Hmmm - do you think that changing something like the type of upstream and downstream switch ports would lead to Windows requiring a license re-activation? If so, we may need to rethink this and hardcode the specific device that's used :-/)
<model type='dmi-to-pci-bridge'/> (i82801b11-bridge) ---------------------------------
Based on Alex's feedback, do we maybe want to name this device "pcie-to-pci-bridge" instead? (or maybe just "pcie-to-pci"?)
(btw, what does "dmi" mean?)
Upstream: pcie-root *only*
Downstream: 32 slots, any PCI device (including "pci-bridge"), no hotplug (?)
qemu commandline: -device i82801b11-bridge,...
This is the gateway to the world of standard old PCI.
why needed?
My understanding is that this is the only type of bridge that can be directly connected to pcie-root (the "root complex") and provide plain old PCI slots.
<model type='pci-bridge'/> (pci-bridge) --------------------------
For consistency with the above "pcie-to-pci-bridge" type, should we name this "pci-to-pci-bridge" instead? (or maybe just "pci-to-pci"?)
Upstream: PCI, connect to 1) pci-root, 2) dmi-to-pci-bridge, 3) another pci-bridge
Downstream: any PCI device, 32 slots
qemu commandline: -device pci-bridge,...
This differs from dmi-to-pci-bridge in that its upstream connection is PCI rather than PCIe (so it will work on an i440FX system, which has a pci-root rather than pcie-root) and that hotplug is supported. In general, if a guest will have any PCI devices, one of these controllers should be added, and the PCI devices connected to it rather than to the dmi-to-pci-bridge.
************************************ (For q35, we *may* decide to always auto-add a dmi-to-pci-bridge at 00:1E.0, and a pci-bridge on slot 1 of the dmi-to-pci-bridge. This will allow a continuation of the tradition of simply adding new devices to the config without worrying about where they connect.)
============================================================================
Just to make sure this config model will work, here is the XML to replicate the layout (only the ones involved in the PCI tree, along with 3 ethernet devices as examples) of the X58 hardware I have sitting under my desk (I've attached lspci and virsh nodedev-list --tree output from that machine):
<controller type='pci' index='0'> <model type='pcie-root'/> </controller>
<controller type='pci' index='1'> <model type='root-port'/> <address type='pci' bus='0' slot='1'/> </controller>
( there is a scsi controller connected to bus='1')
<controller type='pci' index='2'> <model type='root-port'/> <address type='pci' bus='0' slot='3'/> </controller>
(the VGA controller is connected to bus='2')
<controller type='pci' index='3'> <model type='root-port'/> <address type='pci' bus='0' slot='7'/> </controller>
(PCIe SRIOV network card (in external PCIe slot) connected to bus='3')
<controller type='pci' index='4'> <model type='root-port'/> <address type='pci' bus='0' slot='0x1c' function='0'/> </controller>
(unused PCIe slot available on bus='4')
<!-- pcie-root (0:1c.4) -> root-port (5:0.0) -> onboard ethernet -->
<controller type='pci' index='5'>
  <model type='root-port'/>
  <address type='pci' bus='0' slot='0x1c' function='4'/>
</controller>
<interface type='blah'>
  ...
  <mac address='00:27:13:53:db:76'/>
  <address type='pci' bus='5' slot='0' function='0'/>
</interface>

<!-- more complicated connection to 2nd systemboard ethernet -->
<!-- pcie-root -> (0:1c:5) root-port -> (6:0.0) upstream-switch-port -> (7:3.0) downstream-switch-port -> (9:0.0) ethernet -->
<controller type='pci' index='6'>
  <model type='root-port'/>
  <address type='pci' bus='0' slot='0x1c' function='5'/>
</controller>
<controller type='pci' index='7'>
  <model type='upstream-switch-port'/>
  <address type='pci' bus='6' slot='0' function='0'/>
</controller>
<controller type='pci' index='8'>
  <model type='downstream-switch-port'/>
  <address type='pci' bus='7' slot='2' function='0'/>
</controller>
<controller type='pci' index='9'>
  <model type='downstream-switch-port'/>
  <address type='pci' bus='7' slot='3' function='0'/>
</controller>
<interface type='blah'>
  ...
  <mac address='00:27:13:53:db:77'/>
  <address type='pci' bus='9' slot='0' function='0'/>
</interface>

<!-- old-fashioned PCI ethernet in an external PCI slot -->
<controller type='pci' index='0x0a'>
  <model type='dmi-to-pci-bridge'/>
  <address type='pci' bus='0x1e' slot='0' function='0'/>
</controller>
<interface type='blah'>
  ...
  <mac address='00:03:47:7b:63:e6'/>
  <address type='pci' bus='0x0a' slot='0x0e' function='0'/>
</interface>
So I think this will all work. Does anyone see any problems?
If not, then we can bring it all back to the *current* patchset - support for multiple PCI buses using the pci-bridge device. For *that*, we only need to implement the following bits of the above:
1) There will be a new <controller type='pci'> device, with a <model type='xyz'/> subelement. Initially we will support types "pci-root" and "pci-bridge" (all the other types discussed above can be added later). pci-root will have *no <address>* element (and will generate nothing on the qemu commandline, but will create a 32 slot "bus='0'" to plug PCI devices into). pci-bridge will have an <address> element, will generate a -device option on the qemu commandline, and will also create a 32 slot "bus='n'" to plug PCI devices into.
2) for machinetypes that have a PCI bus, the config should have this controller auto-added:
<controller type='pci'> <model type='pci-root'/> </controller>
This will make bus='0' available (but add nothing to the qemu commandline). Any attempt to add a PCI device when there is no bus available should be an error.
3) The way to add more buses will be to add a controller like this:
<controller type='pci'> <model type='pci-bridge'/> </controller>
for legacy PCI, yes;
Right. When I said "current patchset", that's what I was referring to. I started this whole design discussion as a sub-thread of a patcheset that just adds support for the "pci-bridge" device to the existing pc-*/rhel-* machinetypes. I only got into the discussion of all the PCIe controller types because I wanted to make sure that the new controller type we added to support that would be logically expandable to support all of the controllers needed for q35/PCIe support (which is in the queue, but will happen later).
but for PCIe, one needs PCIe-PPB-up & at least one PCI-PPB-down. One _cannot_ have just a single pci-bridge except as the driving bridge from a root-complex port.
I had thought that the possibilities for a bridge that provides a legacy PCI bus were:
1) create an i82801b11-bridge device connected to the root complex (or to any root-port or downstream-switch-port). This new bus will provide 32 legacy PCI slots (slot 0 reserved), but devices (currently) can't be hot-plugged.
2) create a pci-bridge device connected to any existing legacy PCI slot (e.g. one of the slots of an i82801b11-bridge). This new bus will provide 32 legacy PCI slots (slot 0 reserved) and hot-plug *is* supported.
The (up|down)stream-switch-port based "buses" provide PCIe slots, not PCI.
4) When <controller type='usb'> was added, resulting in auto-generated devices, that caused problems when migrating from a host with newer libvirt to one with older libvirt. We need to make sure we don't suffer the same problem this time. See the following two BZes for details (unless you have a better memory than me! :-):
https://bugzilla.redhat.com/show_bug.cgi?id=815503 https://bugzilla.redhat.com/show_bug.cgi?id=856864
(and note how danpb eerily prophesied the current pending situation :-)
I think everything else about Jan's/Liguang's pci-bridge patches can remain.

On Fri, 2013-04-12 at 11:46 -0400, Laine Stump wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote:
On Tue, Apr 09, 2013 at 04:06:06PM -0400, Laine Stump wrote:
On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote: Actually I do wonder if we should represent a PCI root as two <controller> elements, one representing the actual PCI root device, and the other representing the host bridge that is built-in.
Also we should use the actual model names, not 'pci-root' or 'pcie-root' but rather i440FX for "pc" machine type, and whatever the q35 model name is.
- One PCI root with built-in PCI bus (ie today's setup)
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0''/> Isn't this saying that the bridge connects to itself? (since bus 0 is
On 04/09/2013 04:58 AM, Daniel P. Berrange wrote: this bus)
I understand (again, possibly wrongly) that the builtin PCI bus connects to the chipset using its own slot 0 (that's why it's reserved), but that's its address on itself. How is this bridge associated with the pci-root?
Ah, I *think* I see it - the domain attribute of the pci controller is matched to the index of the pci-root controller, correct? But there's still something strange about the <address> of the pci controller being self-referential.
Yes, the index of the pci-root matches the 'domain' of <address>
Okay, then the way that libvirt differentiates between a pci bridge that is connected to the root, and one that is connected to a slot of another bridge is 1) the "bus" attribute of the bridge's <address> matches the "index" attribute of the bridge itself, and 2) "slot" is always 0. Correct?
(The corollary of this is that if slot == 0 and bus != index, or bus == index and slot != 0, it is a configuration error).
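(A sketch of the two cases under that rule, with made-up index and slot numbers:)

    <!-- connected to the root: bus equals the bridge's own index, slot is 0 -->
    <controller type='pci' index='1'>
      <address type='pci' domain='0' bus='1' slot='0' function='0'/>
    </controller>

    <!-- connected to slot 3 of the bridge above: bus != index, slot != 0 -->
    <controller type='pci' index='2'>
      <address type='pci' domain='0' bus='1' slot='3' function='0'/>
    </controller>

    <!-- per the corollary, slot='0' with bus (0) not matching the bridge's
         index (2) would be a configuration error -->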
I'm still unclear on the usefulness of the pci-root controller though - all the necessary information is contained in the pci controller, except for the type of root. But in the case of pcie root, I think you're not allowed to connect a standard bridge to it, only a "dmi-to-pci-bridge" (i82801b11-bridge)
Yes you can connect a pci bridge to pcie-root. It's represented as a root complex integrated device.
Is this accurate? Per the PCI express spec, any PCI express device needs to have a PCI express capability, which our pci-bridge does not. I think this is one of the main differences for our i82801b11-bridge, that it exposes itself as a root complex integrated endpoint, so we know it's effectively a PCIe-to-PCI bridge. We'll be asking for trouble if/when we get guest IOMMU support if we are lax about using PCI-to-PCI bridges where we should have PCIe-to-PCI bridges. There are plenty of examples to the contrary of root complex integrated endpoints without an express capability, but that doesn't make it correct to the spec.
ARGHH!! Just when I think I'm starting to understand *something* about these devices...
(later edit: after some coaching on IRC, I *think* I've got a bit better handle on it.)
</controller> <interface type='direct'> ... <address type='pci' domain='0' bus='0' slot='3'/> </controller>
- One PCI root with built-in PCI bus and extra PCI bridge
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
- One PCI root with built-in PCI bus, PCI-E bus and an extra PCI bridge (ie possible q35 setup) Why would a q35 machine have an i440FX pci-root? It shouldn't, that's a typo
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge --> <address type='pci' domain='0' bus='0' slot='0'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> <controller type="pci" index="1"> <!-- Additional bridge --> <address type='pci' domain='0' bus='0' slot='1'/> </controller> I think you did a cut-paste here and intended to change something, but didn't - those two bridges are identical. Yep, the slot should be 2 in the second one
<interface type='direct'> ... <address type='pci' domain='0' bus='1' slot='3'/> </controller>
So if we later allowed for multiple PCI roots, then we'd have something like
<controller type="pci-root" index="0"> <model name="i440FX"/> </controller> <controller type="pci-root" index="1"> <model name="i440FX"/> </controller> <controller type="pci" index="0"> <!-- Host bridge 1 --> <address type='pci' domain='0' bus='0' slot='0''/> </controller> <controller type="pci" index="0"> <!-- Host bridge 2 --> <address type='pci' domain='1' bus='0' slot='0''/> </controller>
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>?
On real hardware, the platform can specify the _BBN (Base Bus Number = bus) and the _SEG (Segment = domain) of the host bridge. So perhaps you want something like: <controller type="pci-host-bridge"> <model name="i440FX"/> <address type="pci-host-bridge-addr" domain='1' bus='0'/> </controller> "index" is confusing to me.
<interface type='direct'> <!-- NIC on host bridge 2 --> ... <address type='pci' domain='1' bus='0' slot='3'/> </controller>
NB this means that 'index' values can be reused against the <controller>, provided they are setup on different pci-roots.
> (also note that it might happen that the bus number in libvirt's config > will correspond to the bus numbering that shows up in the guest OS, but > that will just be a happy coincidence) > > Does this make sense? Yep, I think we're fairly close. What about the other types of pci controllers that are used by PCIe? We should make sure they fit in this model before we settle on it. What do they do ?
(The descriptions of different models below tell what each of these other devices does; in short, they're all just some sort of electronic Lego to help connect PCI and PCIe devices into a tree).
Okay, I'll make yet another attempt at understanding these devices, and suggesting how they can all be described in the XML. I'm thinking that *all* of the express hubs, switch ports, bridges, etc can be described in xml in the manner above, i.e.:
<controller type='pci' index='n'> <model type='xxx'/> </controller>
and that the method for connecting a device to any of them would be by specifying:
<address type='pci' domain='n' bus='n' slot='n' function='n'/>
Any limitations about which devices/controllers can connect to which controllers, and how many devices can connect to any particular controller will be derived from the <model type='xxx'/>. (And, as we've said before, although qemu doesn't assign each of these controllers a numeric bus id, and although we can make no guarantee that the bus id we use for a particular controller is what will be used by the guest BIOS/OS, it's still a convenient notation and works well with other hypervisors as well as qemu. I'll also note that when I run lspci on an X58-based machine I have here, *all* of the relationships between all the devices listed below are described with simple bus:slot.function numbers.)
Here is a list of the pci controller model types and their restrictions (thanks to mst and aw for repeating these over and over to me; I'm sure I still have made mistakes, but at least it's getting closer).
<controller type='pci-root'> ============================
Upstream: nothing
Downstream: only a single pci-root-bus (implied)
qemu commandline: nothing (it's implied in the q35 machinetype)
Explanation:
Each machine will have a different controller called "pci-root" as outlined above by Daniel. Two types of pci-root will be supported: i440FX and q35. If a pci-root is not spelled out in the config, one will be auto-added (depending on machinetype).
An i440FX pci-root has an implicitly added pci-bridge at 0:0:0.0 (and any bridge that has an address of slot='0' on its own bus is, by definition, connected to a pci-root controller - the two are matched by setting "domain" in the address of the pci-bridge to "index" of the pci-root). This bridge can only have PCI devices added.
A q35 pci-root also implies a different kind of pci-bridge device - one that can only have PCIe devices/controllers attached, but is otherwise identical to the pci-bridge added for i440FX. This bus will be called "root-bus" (Note that there are generally followed conventions for what can be connected to which slot on this bus, and we will probably follow those conventions when building a machine, *but* we will not hardcode this convention into libvirt; each q35 machine will be an empty slate)
<controller type='pci'> =======================
This will be used for *all* of the following controller devices supported by qemu:
<model type='pcie-root-bus'/> (implicit/integrated) ----------------------------
Upstream: connect to pci-root controller *only*
Downstream: 32 slots, PCIe devices only, no hotplug.
qemu commandline: nothing (implicit in the q35-* machinetype)
This controller is the bus described above that connects to a q35's pci-root, and provides places for PCIe devices to connect. Examples are root-ports, dmi-to-pci-bridges, sata controllers, integrated sound/usb/ethernet devices (do any of those that can be connected to the pcie-root-bus exist yet?).
There is only one of these controllers, and it will *always* be index='0', and will always have the following address:
<address type='pci' domain='0' bus='0' slot='0' function='0'/>
Implicit devices make me nervous, why wouldn't this just be a pcie-root (or pcie-host-bridge)? If we want to support multiple host bridges, there can certainly be more than one, so the index='0' assumption seems to fall apart.
<model type='root-port'/> (ioh3420) -------------------------
Upstream: PCIe, connect to pcie-root-bus *only* (?)
yes
Downstream: 1 slot, PCIe devices only (?)
yes
qemu commandline: -device ioh3420,...
These can only connect to the "pcie-root-bus" of a q35 (implying that this bus will need to have a different model name than the simple "pci-bridge")
<model type='dmi-to-pci-bridge'/> (i82801b11-bridge)
I'm worried this name is either too specific or too generic. What happens when we add a generic pcie-bridge and want to use that instead of the i82801b11-bridge? The guest really only sees this as a PCIe-to-PCI bridge, it just happens that on q35 this attaches at the DMI port of the MCH.
---------------------------------
(btw, what does "dmi" mean?)
http://en.wikipedia.org/wiki/Direct_Media_Interface
Upstream: pcie-root-bus *only*
And only to a specific q35 slot (1e.0) for the i82801b11-bridge.
Downstream: 32 slots, any PCI device, no hotplug (?)
Yet, but I think this is where we want to implement ACPI based hotplug.
qemu commandline: -device i82801b11-bridge,...
<model type='upstream-switch-port'/> (x3130-upstream) ------------------------------------
Upstream: PCIe, connect to pcie-root-bus, root-port, or downstream-switch-port (?)
yes
Downstream: 32 slots, connect *only* to downstream-switch-port
I can't verify that there are 32 slots, mst? I've only setup downstream ports within slot 0.
qemu-commandline: -device x3130-upstream
This is the upper side of a switch that can multiplex multiple devices onto a single port. It's only useful when one or more downstream switch ports are connected to it.
<model type='downstream-switch-port'/> (xio3130-downstream) --------------------------------------
Upstream: connect *only* to upstream-switch-port
Downstream: 1 slot, any PCIe device
qemu commandline: -device xio3130-downstream
You can connect one or more of these to an upstream-switch-port in order to effectively plug multiple devices into a single PCIe port.
<model type='pci-bridge'/> (pci-bridge) --------------------------
Upstream: PCI, connect to 1) pci-root, 2) dmi-to-pci-bridge, 3) another pci-bridge
Downstream: any PCI device, 32 slots
qemu commandline: -device pci-bridge,...
This differs from dmi-to-pci-bridge in that its upstream connection is PCI rather than PCIe (so it will work on an i440FX system, which has no root PCIe bus) and that hotplug is supported. In general, if a guest will have any PCI devices, one of these controllers should be added, and the PCI devices connected to it rather than to the dmi-to-pci-bridge.
===============================================================
Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:
1) eliminate the separate pci-root controller type
2) within <controller type='pci'>, a new <model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an <address> element (neither will pcie-root-bus).
I think they should both have a domain + bus address to make it possible to build multi-domain/multi-host bridge systems. They do not use any slots though.
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller.
Or this. It wouldn't even be necessary if we supported a 'pci-root-addr' address type for the above, with the default being domain=0, bus=0? I suppose it doesn't matter whether it's a separate attribute or new address type though.

Thanks,
Alex

On Mon, Apr 15, 2013 at 11:27:03AM -0600, Alex Williamson wrote:
On Fri, 2013-04-12 at 11:46 -0400, Laine Stump wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote:
On Tue, Apr 09, 2013 at 04:06:06PM -0400, Laine Stump wrote:
On 04/09/2013 04:58 AM, Daniel P. Berrange wrote: > On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote: > Actually I do wonder if we should reprent a PCI root as two > <controller> elements, one representing the actual PCI root > device, and the other representing the host bridge that is > built-in. > > Also we should use the actual model names, not 'pci-root' or > 'pcie-root' but rather i440FX for "pc" machine type, and whatever > the q35 model name is. > > - One PCI root with built-in PCI bus (ie todays' setup) > > <controller type="pci-root" index="0"> > <model name="i440FX"/> > </controller> > <controller type="pci" index="0"> <!-- Host bridge --> > <address type='pci' domain='0' bus='0' slot='0''/> Isn't this saying that the bridge connects to itself? (since bus 0 is this bus)
I understand (again, possibly wrongly) that the builtin PCI bus connects to the chipset using its own slot 0 (that's why it's reserved), but that's its address on itself. How is this bridge associated with the pci-root?
Ah, I *think* I see it - the domain attribute of the pci controller is matched to the index of the pci-root controller, correct? But there's still something strange about the <address> of the pci controller being self-referential. Yes, the index of the pci-root matches the 'domain' of <address>
Okay, then the way that libvirt differentiates between a pci bridge that is connected to the root, and one that is connected to a slot of another bridge is 1) the "bus" attribute of the bridge's <address> matches the "index" attribute of the bridge itself, and 2) "slot" is always 0. Correct?
(The corollary of this is that if slot == 0 and bus != index, or bus == index and slot != 0, it is a configuration error).
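So, to illustrate the rule with two bridges (values purely for illustration):

<!-- connected to the root: bus == index and slot == 0 -->
<controller type='pci' index='0'>
  <address type='pci' domain='0' bus='0' slot='0'/>
</controller>

<!-- connected to slot 5 of bus 0, i.e. to another bridge: bus != index -->
<controller type='pci' index='1'>
  <address type='pci' domain='0' bus='0' slot='5'/>
</controller>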
I'm still unclear on the usefulness of the pci-root controller though - all the necessary information is contained in the pci controller, except for the type of root. But in the case of pcie root, I think you're not allowed to connect a standard bridge to it, only a "dmi-to-pci-bridge" (i82801b11-bridge) Yes you can connect a pci bridge to pcie-root. It's represented as a root complex integrated device.
Is this accurate? Per the PCI express spec, any PCI express device needs to have a PCI express capability, which our pci-bridge does not. I think this is one of the main differences for our i82801b11-bridge, that it exposes itself as a root complex integrated endpoint, so we know it's effectively a PCIe-to-PCI bridge.
If it does not have an express link upstream it's not a PCIe-to-PCI bridge, is it?
We'll be asking for trouble if/when we get guest IOMMU support if we are lax about using PCI-to-PCI bridges where we should have PCIe-to-PCI bridges.
I recall the spec saying somewhere that integrated endpoints are outside the root complex hierarchy. I think IOMMU will simply not apply to these.
There are plenty of examples to the contrary of root complex integrated endpoints without an express capability, but that doesn't make it correct to the spec.
Is there something in the spec explicitly forbidding this? I merely find: The PCI Express Capability structure is required for PCI Express device Functions. So if it's not an express device it does not have to have an express capability? Maybe we should send an example dump to pci sig and ask them...
ARGHH!! Just when I think I'm starting to understand *something* about these devices...
(later edit: after some coaching on IRC, I *think* I've got a bit better handle on it.)
> </controller> > <interface type='direct'> > ... > <address type='pci' domain='0' bus='0' slot='3'/> > </controller> > > - One PCI root with built-in PCI bus and extra PCI bridge > > <controller type="pci-root" index="0"> > <model name="i440FX"/> > </controller> > <controller type="pci" index="0"> <!-- Host bridge --> > <address type='pci' domain='0' bus='0' slot='0'/> > </controller> > <controller type="pci" index="1"> <!-- Additional bridge --> > <address type='pci' domain='0' bus='0' slot='1'/> > </controller> > <interface type='direct'> > ... > <address type='pci' domain='0' bus='1' slot='3'/> > </controller> > > - One PCI root with built-in PCI bus, PCI-E bus and and extra PCI bridge > (ie possible q35 setup) Why would a q35 machine have an i440FX pci-root? It shouldn't, that's a typo
> <controller type="pci-root" index="0"> > <model name="i440FX"/> > </controller> > <controller type="pci" index="0"> <!-- Host bridge --> > <address type='pci' domain='0' bus='0' slot='0'/> > </controller> > <controller type="pci" index="1"> <!-- Additional bridge --> > <address type='pci' domain='0' bus='0' slot='1'/> > </controller> > <controller type="pci" index="1"> <!-- Additional bridge --> > <address type='pci' domain='0' bus='0' slot='1'/> > </controller> I think you did a cut-paste here and intended to change something, but didn't - those two bridges are identical. Yep, the slot should be 2 in the second one
> <interface type='direct'> > ... > <address type='pci' domain='0' bus='1' slot='3'/> > </controller> > > So if we later allowed for mutiple PCI roots, then we'd have something > like > > <controller type="pci-root" index="0"> > <model name="i440FX"/> > </controller> > <controller type="pci-root" index="1"> > <model name="i440FX"/> > </controller> > <controller type="pci" index="0"> <!-- Host bridge 1 --> > <address type='pci' domain='0' bus='0' slot='0''/> > </controller> > <controller type="pci" index="0"> <!-- Host bridge 2 --> > <address type='pci' domain='1' bus='0' slot='0''/> > </controller>
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>?
On real hardware, the platform can specify the _BBN (Base Bus Number = bus) and the _SEG (Segment = domain) of the host bridge. So perhaps you want something like:
<controller type="pci-host-bridge"> <model name="i440FX"/> <address type="pci-host-bridge-addr" domain='1' bus='0'/> </controller>
Yes, we could specify segments, though it's not the same as a domain as linux guests define it (I assume this is what libvirt wants to call a domain): if memory serves a segment does not have to be a root based hierarchy, linux domains are all root based. We are better off not specifying BBN for all buses I think - it's intended for multi-root support for legacy OSes.
"index" is confusing to me.
I'd prefer ID for bus not a number, I'm concerned users will assume it's bus number and get confused by a mismatch.
> <interface type='direct'> <!-- NIC on host bridge 2 --> > ... > <address type='pci' domain='1' bus='0' slot='3'/> > </controller> > > > NB this means that 'index' values can be reused against the > <controller>, provided they are setup on different pci-roots. > >> (also note that it might happen that the bus number in libvirt's config >> will correspond to the bus numbering that shows up in the guest OS, but >> that will just be a happy coincidence) >> >> Does this make sense? > Yep, I think we're fairly close. What about the other types of pci controllers that are used by PCIe? We should make sure they fit in this model before we settle on it. What do they do ?
(The descriptions of different models below tell what each of these other devices does; in short, they're all just some sort of electronic Lego to help connect PCI and PCIe devices into a tree).
Okay, I'll make yet another attempt at understanding these devices, and suggesting how they can all be described in the XML. I'm thinking that *all* of the express hubs, switch ports, bridges, etc can be described in xml in the manner above, i.e.:
<controller type='pci' index='n'>
  <model type='xxx'/>
</controller>
and that the method for connecting a device to any of them would be by specifying:
<address type='pci' domain='n' bus='n' slot='n' function='n'/>
Any limitations about which devices/controllers can connect to which controllers, and how many devices can connect to any particular controller will be derived from the <model type='xxx'/>. (And, as we've said before, although qemu doesn't assign each of these controllers a numeric bus id, and although we can make no guarantee that the bus id we use for a particular controller is what will be used by the guest BIOS/OS, it's still a convenient notation and works well with other hypervisors as well as qemu. I'll also note that when I run lspci on an X58-based machine I have here, *all* of the relationships between all the devices listed below are described with simple bus:slot.function numbers.)
Here is a list of the pci controller model types and their restrictions (thanks to mst and aw for repeating these over and over to me; I'm sure I still have made mistakes, but at least it's getting closer).
<controller type='pci-root'> ============================
Upstream: nothing Downstream: only a single pci-root-bus (implied) qemu commandline: nothing (it's implied in the q35 machinetype)
Explanation:
Each machine will have a different controller called "pci-root" as outlined above by Daniel. Two types of pci-root will be supported: i440FX and q35. If a pci-root is not spelled out in the config, one will be auto-added (depending on machinetype).
An i440FX pci-root has an implicitly added pci-bridge at 0:0:0.0 (and any bridge that has an address of slot='0' on its own bus is, by definition, connected to a pci-root controller - the two are matched by setting "domain" in the address of the pci-bridge to "index" of the pci-root). This bridge can only have PCI devices added.
A q35 pci-root also implies a different kind of pci-bridge device - one that can only have PCIe devices/controllers attached, but is otherwise identical to the pci-bridge added for i440FX. This bus will be called "root-bus" (Note that there are generally followed conventions for what can be connected to which slot on this bus, and we will probably follow those conventions when building a machine, *but* we will not hardcode this convention into libvirt; each q35 machine will be an empty slate)
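If the implied controllers were spelled out explicitly, they might look something like this (just a sketch; the exact contents of the pci-root element aren't settled here):

<!-- i440FX ("pc-*") machinetypes -->
<controller type='pci-root' index='0'/>
<controller type='pci' index='0'>
  <model type='pci-bridge'/>
  <address type='pci' domain='0' bus='0' slot='0' function='0'/>
</controller>

<!-- q35 machinetypes -->
<controller type='pci-root' index='0'/>
<controller type='pci' index='0'>
  <model type='pcie-root-bus'/>
  <address type='pci' domain='0' bus='0' slot='0' function='0'/>
</controller>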
<controller type='pci'> =======================
This will be used for *all* of the following controller devices supported by qemu:
<model type='pcie-root-bus'/> (implicit/integrated) ----------------------------
Upstream: connect to pci-root controller *only* Downstream: 32 slots, PCIe devices only, no hotplug. qemu commandline: nothing (implicit in the q35-* machinetype)
This controller is the bus described above that connects to a q35's pci-root, and provides places for PCIe devices to connect. Examples are root-ports, dmi-to-pci-bridges, SATA controllers, and integrated sound/usb/ethernet devices (do any of those that can be connected to the pcie-root-bus exist yet?).
There is only one of these controllers, and it will *always* be index='0', and will always have the following address:
<address type='pci' domain='0' bus='0' slot='0' function='0'/>
Implicit devices make me nervous, why wouldn't this just be a pcie-root (or pcie-host-bridge)? If we want to support multiple host bridges, there can certainly be more than one, so the index='0' assumption seems to fall apart.
<model type='root-port'/> (ioh3420) -------------------------
Upstream: PCIe, connect to pcie-root-bus *only* (?)
yes
Downstream: 1 slot, PCIe devices only (?)
yes
qemu commandline: -device ioh3420,...
These can only connect to the "pcie-root-bus" of a q35 (implying that this bus will need to have a different model name than the simple "pci-bridge").
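For example (illustrative numbers only), a root-port on slot 2 of the pcie-root-bus, with a PCIe device plugged into its single slot:

<controller type='pci' index='1'>
  <model type='root-port'/>
  <address type='pci' domain='0' bus='0' slot='2' function='0'/>
</controller>
<interface type='direct'>
  ...
  <address type='pci' domain='0' bus='1' slot='0' function='0'/>
</interface>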
<model type='dmi-to-pci-bridge'/> (i82801b11-bridge)
I'm worried this name is either too specific or too generic. What happens when we add a generic pcie-bridge and want to use that instead of the i82801b11-bridge? The guest really only sees this as a PCIe-to-PCI bridge, it just happens that on q35 this attaches at the DMI port of the MCH.
---------------------------------
(btw, what does "dmi" mean?)
http://en.wikipedia.org/wiki/Direct_Media_Interface
Upstream: pcie-root-bus *only*
And only to a specific q35 slot (1e.0) for the i82801b11-bridge.
Downstream: 32 slots, any PCI device, no hotplug (?)
Yet, but I think this is where we want to implement ACPI based hotplug.
qemu commandline: -device i82801b11-bridge,...
<model type='upstream-switch-port'/> (x3130-upstream) ------------------------------------
Upstream: PCIe, connect to pcie-root-bus, root-port, or downstream-switch-port (?)
yes
Downstream: 32 slots, connect *only* to downstream-switch-port
I can't verify that there are 32 slots, mst? I've only setup downstream ports within slot 0.
qemu-commandline: -device x3130-upstream
This is the upper side of a switch that can multiplex multiple devices onto a single port. It's only useful when one or more downstream switch ports are connected to it.
<model type='downstream-switch-port'/> (xio3130-downstream) --------------------------------------
Upstream: connect *only* to upstream-switch-port Downstream: 1 slot, any PCIe device qemu commandline: -device xio3130-downstream
You can connect one or more of these to an upstream-switch-port in order to effectively plug multiple devices into a single PCIe port.
<model type='pci-bridge'/> (pci-bridge) --------------------------
Upstream: PCI, connect to 1) pci-root, 2) dmi-to-pci-bridge, 3) another pci-bridge Downstream: any PCI device, 32 slots qemu commandline: -device pci-bridge,...
This differs from dmi-to-pci-bridge in that its upstream connection is PCI rather than PCIe (so it will work on an i440FX system, which has no root PCIe bus) and that hotplug is supported. In general, if a guest will have any PCI devices, one of these controllers should be added, and the guest's PCI devices connected to it.
===============================================================
Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:
1) eliminate the separate pci-root controller type
2) within <controller type='pci'>, a new <model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an <address> element (neither will pcie-root-bus).
I think they should both have a domain + bus address to make it possible to build multi-domain/multi-host bridge systems. They do not use any slots though.
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller.
Or wouldn't this even be unnecessary if we supported a 'pci-root-addr' address type for the above, with the default being domain=0, bus=0? I suppose it doesn't matter whether it's a separate attribute or a new address type though. Thanks,
Alex
Also AFAIK there's nothing in the spec that requires bus=0 to be root. The _BBN hack above is used sometimes to give !=0 bus numbers to roots. -- MST

On 04/15/2013 05:58 PM, Michael S. Tsirkin wrote:
On Mon, Apr 15, 2013 at 11:27:03AM -0600, Alex Williamson wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote:
On Tue, Apr 09, 2013 at 04:06:06PM -0400, Laine Stump wrote: > On 04/09/2013 04:58 AM, Daniel P. Berrange wrote: >> On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote: >> Actually I do wonder if we should reprent a PCI root as two >> <controller> elements, one representing the actual PCI root >> device, and the other representing the host bridge that is >> built-in. >> >> Also we should use the actual model names, not 'pci-root' or >> 'pcie-root' but rather i440FX for "pc" machine type, and whatever >> the q35 model name is. >> >> - One PCI root with built-in PCI bus (ie todays' setup) >> >> <controller type="pci-root" index="0"> >> <model name="i440FX"/> >> </controller> >> <controller type="pci" index="0"> <!-- Host bridge --> >> <address type='pci' domain='0' bus='0' slot='0''/> > Isn't this saying that the bridge connects to itself? (since bus 0 is > this bus) > > I understand (again, possibly wrongly) that the builtin PCI bus connects > to the chipset using its own slot 0 (that's why it's reserved), but > that's its address on itself. How is this bridge associated with the > pci-root? > > Ah, I *think* I see it - the domain attribute of the pci controller is > matched to the index of the pci-root controller, correct? But there's > still something strange about the <address> of the pci controller being > self-referential. Yes, the index of the pci-root matches the 'domain' of <address> Okay, then the way that libvirt differentiates between a pci bridge that is connected to the root, and one that is connected to a slot of another bridge is 1) the "bus" attribute of the bridge's <address> matches the "index" attribute of the bridge itself, and 2) "slot" is always 0. Correct?
(The corollary of this is that if slot == 0 and bus != index, or bus == index and slot != 0, it is a configuration error).
I'm still unclear on the usefulness of the pci-root controller though - all the necessary information is contained in the pci controller, except for the type of root. But in the case of pcie root, I think you're not allowed to connect a standard bridge to it, only a "dmi-to-pci-bridge" (i82801b11-bridge) Yes you can connect a pci bridge to pcie-root. It's represented as a root complex integrated device. Is this accurate? Per the PCI express spec, any PCI express device needs to have a PCI express capability, which our pci-bridge does not. I think this is one of the main differences for our i82801b11-bridge,
On Fri, 2013-04-12 at 11:46 -0400, Laine Stump wrote: that it exposes itself as a root complex integrated endpoint, so we know it's effectively a PCIe-to-PCI bridge. If it does not have an express link upstream it's not a PCIe-to-PCI bridge, is it?
To my untrained ear it sounds like you're disagreeing with yourself ???
We'll be asking for trouble if/when we get guest IOMMU support if we are lax about using PCI-to-PCI bridges where we should have PCIe-to-PCI bridges. I recall the spec saying somewhere that integrated endpoints are outside the root complex hierarchy. I think IOMMU will simply not apply to these.
Correct me if I'm wrong - I think libvirt can ignore this bit of debate other than to use its result to determine which devices are allowed to connect to which other devices, right?
There are plenty of examples to the contrary of root complex integrated endpoints without an express capability, but that doesn't make it correct to the spec. Is there something in the spec explicitly forbidding this? I merely find: The PCI Express Capability structure is required for PCI Express device Functions. So if it's not an express device it does not have to have an express capability?
Maybe we should send an example dump to pci sig and ask them...
ARGHH!! Just when I think I'm starting to understand *something* about these devices...
(later edit: after some coaching on IRC, I *think* I've got a bit better handle on it.)
(But I guess not good enough :-P)
>> </controller> >> <interface type='direct'> >> ... >> <address type='pci' domain='0' bus='0' slot='3'/> >> </controller> >> >> - One PCI root with built-in PCI bus and extra PCI bridge >> >> <controller type="pci-root" index="0"> >> <model name="i440FX"/> >> </controller> >> <controller type="pci" index="0"> <!-- Host bridge --> >> <address type='pci' domain='0' bus='0' slot='0'/> >> </controller> >> <controller type="pci" index="1"> <!-- Additional bridge --> >> <address type='pci' domain='0' bus='0' slot='1'/> >> </controller> >> <interface type='direct'> >> ... >> <address type='pci' domain='0' bus='1' slot='3'/> >> </controller> >> >> - One PCI root with built-in PCI bus, PCI-E bus and and extra PCI bridge >> (ie possible q35 setup) > Why would a q35 machine have an i440FX pci-root? It shouldn't, that's a typo
>> <controller type="pci-root" index="0"> >> <model name="i440FX"/> >> </controller> >> <controller type="pci" index="0"> <!-- Host bridge --> >> <address type='pci' domain='0' bus='0' slot='0'/> >> </controller> >> <controller type="pci" index="1"> <!-- Additional bridge --> >> <address type='pci' domain='0' bus='0' slot='1'/> >> </controller> >> <controller type="pci" index="1"> <!-- Additional bridge --> >> <address type='pci' domain='0' bus='0' slot='1'/> >> </controller> > I think you did a cut-paste here and intended to change something, but > didn't - those two bridges are identical. Yep, the slot should be 2 in the second one
>> <interface type='direct'> >> ... >> <address type='pci' domain='0' bus='1' slot='3'/> >> </controller> >> >> So if we later allowed for mutiple PCI roots, then we'd have something >> like >> >> <controller type="pci-root" index="0"> >> <model name="i440FX"/> >> </controller> >> <controller type="pci-root" index="1"> >> <model name="i440FX"/> >> </controller> >> <controller type="pci" index="0"> <!-- Host bridge 1 --> >> <address type='pci' domain='0' bus='0' slot='0''/> >> </controller> >> <controller type="pci" index="0"> <!-- Host bridge 2 --> >> <address type='pci' domain='1' bus='0' slot='0''/> >> </controller>
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>?
On real hardware, the platform can specify the _BBN (Base Bus Number = bus) and the _SEG (Segment = domain) of the host bridge. So perhaps you want something like:
<controller type="pci-host-bridge"> <model name="i440FX"/> <address type="pci-host-bridge-addr" domain='1' bus='0'/> </controller>
The <address> element is intended to specify where a device or controller is connected *to*, not what bus/domain it *provides*. I think you're intending for this to provide domain 1 bus 0, so according to existing convention, you would want that information in the <controller> element attributes (e.g. for all other controller types, the generic "index" attribute is used to indicate a bus number when such a thing is appropriate for that type of controller). Anyway, I've simplified this a bit in my latest iteration - there are no separate "root" and "root bus" controllers, just a "pci-root" (for i440FX) or "pcie-root" (for q35), both of which provide a "pci" bus (I'm using the term loosely here), each with different restrictions about what can be connected.
Yes, we could specify segments, though it's not the same as a domain as linux guests define it (I assume this is what libvirt wants to call a domain): if memory serves a segment does not have to be a root based hierarchy, linux domains are all root based.
I'm not exactly sure of the meanings/implications of all those terms, but from the point of view of libvirt, as long as we can represent all possible connections between devices using the domain:bus:slot.function notation, I think it doesn't matter too much.
We are better off not specifying BBN for all buses I think -
How would you differentiate between the different buses without some sort of identifier?
it's intended for multi-root support for legacy OSes.
"index" is confusing to me.
index is being used just because that's been the convention for other controller types - when there are multiple controllers of the same type, each is given an index, and that's used in the "child" devices to indicate which of the parent controllers they connect to.
I'd prefer ID for bus not a number, I'm concerned users will assume it's bus number and get confused by a mismatch.
So you would rather that they were something like this?

<controller type='pci' bus='pci.0'>
  <model type='pci-root'/>
</controller>
<interface type='blah'>
  ...
  <address type='pci' domain='0' bus='pci.0' slot='0' function='0'/>
</interface>

The problem is that the use of numeric bus IDs is fairly deeply ingrained in libvirt; every existing libvirt guest config has device addresses specifying "bus='0'". Switching to an alphanumeric ID rather than a simple number would require extra care to maintain backward compatibility with all those existing configs and with previous versions of libvirt that might end up being the recipient of XML generated by a newer libvirt. Because of this, at the very least the pci.0 bus must be referred to as bus='0'; once we've done that, we might as well refer to them *all* numerically (even if names were allowed, I'm sure everybody would just call them '1', '2' (or at the very most "pci.1", "pci.2") etc. anyway).
>> <interface type='direct'> <!-- NIC on host bridge 2 --> >> ... >> <address type='pci' domain='1' bus='0' slot='3'/> >> </controller> >> >> >> NB this means that 'index' values can be reused against the >> <controller>, provided they are setup on different pci-roots. >> >>> (also note that it might happen that the bus number in libvirt's config >>> will correspond to the bus numbering that shows up in the guest OS, but >>> that will just be a happy coincidence) >>> >>> Does this make sense? >> Yep, I think we're fairly close. > What about the other types of pci controllers that are used by PCIe? We > should make sure they fit in this model before we settle on it. What do they do ? (The descriptions of different models below tell what each of these other devices does; in short, they're all just some sort of electronic Lego to help connect PCI and PCIe devices into a tree).
Okay, I'll make yet another attempt at understanding these devices, and suggesting how they can all be described in the XML. I'm thinking that *all* of the express hubs, switch ports, bridges, etc can be described in xml in the manner above, i.e.:
<controller type='pci' index='n'>
  <model type='xxx'/>
</controller>
and that the method for connecting a device to any of them would be by specifying:
<address type='pci' domain='n' bus='n' slot='n' function='n'/>
Any limitations about which devices/controllers can connect to which controllers, and how many devices can connect to any particular controller will be derived from the <model type='xxx'/>. (And, as we've said before, although qemu doesn't assign each of these controllers a numeric bus id, and although we can make no guarantee that the bus id we use for a particular controller is what will be used by the guest BIOS/OS, it's still a convenient notation and works well with other hypervisors as well as qemu. I'll also note that when I run lspci on an X58-based machine I have here, *all* of the relationships between all the devices listed below are described with simple bus:slot.function numbers.)
Here is a list of the pci controller model types and their restrictions (thanks to mst and aw for repeating these over and over to me; I'm sure I still have made mistakes, but at least it's getting closer).
<controller type='pci-root'> ============================
Upstream: nothing Downstream: only a single pci-root-bus (implied) qemu commandline: nothing (it's implied in the q35 machinetype)
Explanation:
Each machine will have a different controller called "pci-root" as outlined above by Daniel. Two types of pci-root will be supported: i440FX and q35. If a pci-root is not spelled out in the config, one will be auto-added (depending on machinetype).
An i440FX pci-root has an implicitly added pci-bridge at 0:0:0.0 (and any bridge that has an address of slot='0' on its own bus is, by definition, connected to a pci-root controller - the two are matched by setting "domain" in the address of the pci-bridge to "index" of the pci-root). This bridge can only have PCI devices added.
A q35 pci-root also implies a different kind of pci-bridge device - one that can only have PCIe devices/controllers attached, but is otherwise identical to the pci-bridge added for i440FX. This bus will be called "root-bus" (Note that there are generally followed conventions for what can be connected to which slot on this bus, and we will probably follow those conventions when building a machine, *but* we will not hardcode this convention into libvirt; each q35 machine will be an empty slate)
<controller type='pci'> =======================
This will be used for *all* of the following controller devices supported by qemu:
<model type='pcie-root-bus'/> (implicit/integrated) ----------------------------
Upstream: connect to pci-root controller *only* Downstream: 32 slots, PCIe devices only, no hotplug. qemu commandline: nothing (implicit in the q35-* machinetype)
This controller is the bus described above that connects to a q35's pci-root, and provides places for PCIe devices to connect. Examples are root-ports, dmi-to-pci-bridges, SATA controllers, and integrated sound/usb/ethernet devices (do any of those that can be connected to the pcie-root-bus exist yet?).
There is only one of these controllers, and it will *always* be index='0', and will always have the following address:
<address type='pci' domain='0' bus='0' slot='0' function='0'/>

Implicit devices make me nervous, why wouldn't this just be a pcie-root (or pcie-host-bridge)? If we want to support multiple host bridges, there can certainly be more than one, so the index='0' assumption seems to fall apart.
That's when we need to start talking about a "domain" attribute, like this:

<controller type='pci' domain='1' index='0'>
  <model type='pcie-root-bus'/>
</controller>
<model type='root-port'/> (ioh3420) -------------------------
Upstream: PCIe, connect to pcie-root-bus *only* (?) yes
Downstream: 1 slot, PCIe devices only (?) yes
qemu commandline: -device ioh3420,...
These can only connect to the "pcie-root-bus" of a q35 (implying that this bus will need to have a different model name than the simple "pci-bridge").
<model type='dmi-to-pci-bridge'/> (i82801b11-bridge)
I'm worried this name is either too specific or too generic. What happens when we add a generic pcie-bridge and want to use that instead of the i82801b11-bridge? The guest really only sees this as a PCIe-to-PCI bridge, it just happens that on q35 this attaches at the DMI port of the MCH.
Hehe. Just using the name you (Alex) suggested :-) My use of the "generic" device *type* names rather than exact hardware model names is based on the idea that any given machinetype will have a set of these "building block" devices available, and as long as you use everything from the same "set" on a given machine, it doesn't really matter which set you use. Is this a valid assumption?
---------------------------------
(btw, what does "dmi" mean?) http://en.wikipedia.org/wiki/Direct_Media_Interface
Upstream: pcie-root-bus *only* And only to a specific q35 slot (1e.0) for the i82801b11-bridge.
Downstream: 32 slots, any PCI device, no hotplug (?) Yet, but I think this is where we want to implement ACPI based hotplug.
Okay, but for now libvirt can just refrain from auto-addressing any user-created devices to that bus; we'll just make sure that there is always a "pci-bridge" plugged into it, and auto-addressed devices will all be put there. In the meantime if someone explicitly addresses a device to connect to the i82801b11-bridge, we'll let them do it, but if they try to hot-unplug it they will get an error.
qemu commandline: -device i82801b11-bridge,...
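Concretely, the arrangement described above (a dmi-to-pci-bridge at its conventional q35 slot, with a pci-bridge always plugged into it where auto-addressed devices will land) might be spelled out like this (index values illustrative, slot 30 == 0x1e):

<controller type='pci' index='1'>
  <model type='dmi-to-pci-bridge'/>
  <address type='pci' domain='0' bus='0' slot='30' function='0'/>
</controller>
<controller type='pci' index='2'>
  <model type='pci-bridge'/>
  <address type='pci' domain='0' bus='1' slot='1' function='0'/>
</controller>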
<model type='upstream-switch-port'/> (x3130-upstream) ------------------------------------
Upstream: PCIe, connect to pcie-root-bus, root-port, or downstream-switch-port (?) yes
Downstream: 32 slots, connect *only* to downstream-switch-port I can't verify that there are 32 slots, mst? I've only setup downstream ports within slot 0.
According to a discussion with Don Dutile on IRC yesterday, the downstream side of an upstream-switch-port has 32 "slots" with 8 "functions" each, and each of these functions can have a downstream-switch-port connected. That said, he told me that in every case he's seen in the real world, all the downstream-switch-ports were connected to "function 0", effectively limiting it to 32 downstreams/upstream.
qemu-commandline: -device x3130-upstream
This is the upper side of a switch that can multiplex multiple devices onto a single port. It's only useful when one or more downstream switch ports are connected to it.
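Given that layout, each downstream-switch-port would be addressed to some slot/function of the bus provided by the upstream port, e.g. (assuming the upstream-switch-port is index='2'; real hardware apparently sticks to function 0, but per the above any of the 8 functions could in theory be used):

<controller type='pci' index='3'>
  <model type='downstream-switch-port'/>
  <address type='pci' domain='0' bus='2' slot='0' function='0'/>
</controller>
<controller type='pci' index='4'>
  <model type='downstream-switch-port'/>
  <address type='pci' domain='0' bus='2' slot='0' function='1'/>
</controller>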
<model type='downstream-switch-port'/> (xio3130-downstream) --------------------------------------
Upstream: connect *only* to upstream-switch-port Downstream: 1 slot, any PCIe device qemu commandline: -device xio3130-downstream
You can connect one or more of these to an upstream-switch-port in order to effectively plug multiple devices into a single PCIe port.
<model type='pci-bridge'/> (pci-bridge) --------------------------
Upstream: PCI, connect to 1) pci-root, 2) dmi-to-pci-bridge, 3) another pci-bridge Downstream: any PCI device, 32 slots qemu commandline: -device pci-bridge,...
This differs from dmi-to-pci-bridge in that its upstream connection is PCI rather than PCIe (so it will work on an i440FX system, which has no root PCIe bus) and that hotplug is supported. In general, if a guest will have any PCI devices, one of these controllers should be added, and the guest's PCI devices connected to it.
===============================================================
Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:
1) eliminate the separate pci-root controller type
2) within <controller type='pci'>, a new <model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an <address> element (neither will pcie-root-bus). I think they should both have a domain + bus address to make it possible to build multi-domain/multi-host bridge systems. They do not use any slots though.
Yes. I think I agree with that. But we don't have to implement the multiple-domain stuff today (since qemu doesn't support it yet), and when we do, I think we can just add a "domain" attribute to the main element of pci-root and pcie-root controllers.
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller.
Or wouldn't this even be unnecessary if we supported a 'pci-root-addr' address type for the above, with the default being domain=0, bus=0? I suppose it doesn't matter whether it's a separate attribute or a new address type though. Thanks,
I think you're mixing up the purpose of the <address> element vs the "index" attribute in the main <controller> element. To clarify, take this example:

<controller type='pci' index='3'>
  <model type='pci-bridge'/>
  <address domain='0' bus='1' slot='9' function='0'/>
</controller>

This controller is connected to slot 9 of the already-existing bus 1. It provides a bus 3 for other devices to connect to. If we wanted to start up a domain 1, we would do something like this:

<controller type='pci' domain='1' index='0'>
  <model type='pci-root'/>
</controller>

This would give us a PCI bus 0 in domain 1. You could then connect a pci-bridge to it like this:

<controller type='pci' domain='1' index='1'>
  <model type='pci-bridge'/>
  <address type='pci' domain='1' bus='0' slot='1' function='0'/>
</controller>

The <address> tells us that this new bus connects to slot 1 of PCI bus 0 in domain 1. The <controller domain='1' index='1'> tells us that there is now a new bus other devices can connect to that is at domain='1' bus='1'.
Also AFAIK there's nothing in the spec that requires bus=0 to be root. The _BBN hack above is used sometimes to give !=0 bus numbers to roots.
I don't really understand that, but do you think that 1) qemu would ever want/be able to model that, or that 2) anyone would ever have a practical reason for wanting to? It's really cool and all to be able to replicate any possible esoteric hardware configuration in a virtual machine, but it seems like the only practical use of replicating something like that would be for someone wanting to test what their OS does when there's no domain=0 in the hardware...

On Tue, Apr 16, 2013 at 12:05:40PM -0400, Laine Stump wrote:
On 04/15/2013 05:58 PM, Michael S. Tsirkin wrote:
On Mon, Apr 15, 2013 at 11:27:03AM -0600, Alex Williamson wrote:
On Fri, 2013-04-12 at 11:46 -0400, Laine Stump wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote: it's intended for multi-root support for legacy OSes.
"index" is confusing to me.
index is being used just because that's been the convention for other controller types - when there are multiple controllers of the same type, each is given an index, and that's used in the "child" devices to indicate which of the parent controllers they connect to.
I'd prefer ID for bus not a number, I'm concerned users will assume it's bus number and get confused by a mismatch.
So you would rather that they were something like this?
<controller type='pci' bus='pci.0'>
  <model type='pci-root'/>
</controller>
<interface type='blah'>
  ...
  <address type='pci' domain='0' bus='pci.0' slot='0' function='0'/>
</interface>
The problem is that the use of numeric bus IDs is fairly deeply ingrained in libvirt; every existing libvirt guest config has device addresses specifying "bus='0'". Switching to an alphanumeric ID rather than a simple number would require extra care to maintain backward compatibility with all those existing configs and with previous versions of libvirt that might end up being the recipient of XML generated by a newer libvirt. Because of this, at the very least the pci.0 bus must be referred to as bus='0'; once we've done that, we might as well refer to them *all* numerically (even if names were allowed, I'm sure everybody would just call them '1', '2' (or at the very most "pci.1", "pci.2") etc. anyway).
We explicitly do *NOT* want to use something like 'pci.1' because that is a QEMU-specific naming concept. The use of numeric IDs was a deliberate choice for libvirt addressing and not something we are going to change.
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller.
Or wouldn't this even be unnecessary if we supported a 'pci-root-addr' address type for the above, with the default being domain=0, bus=0? I suppose it doesn't matter whether it's a separate attribute or a new address type though. Thanks,
I think you're mixing up the purpose of the <address> element vs the "index" attribute in the main <controller> element. To clarify, take this example:
<controller type='pci' index='3'>
  <model type='pci-bridge'/>
  <address domain='0' bus='1' slot='9' function='0'/>
</controller>
This controller is connected to slot 9 of the already-existing bus 1. It provides a bus 3 for other devices to connect to. If we wanted to start up a domain 1, we would do something like this:
<controller type='pci' domain='1' index='0'>
  <model type='pci-root'/>
</controller>
This would give us a PCI bus 0 in domain 1. You could then connect a pci-bridge to it like this:
<controller type='pci' domain='1' index='1'>
  <model type='pci-bridge'/>
  <address type='pci' domain='1' bus='0' slot='1' function='0'/>
</controller>
The <address> tells us that this new bus connects to slot 1 of PCI bus 0 in domain 1. The <controller domain='1' index='1'> tells us that there is now a new bus other devices can connect to that is at domain='1' bus='1'.
Can we just leave multi-domains out of this discussion for now? IMHO they are just adding more confusion to an already complicated problem. Let's focus on just solving the problem of PCI bridges, which is what we actually need right now.

Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 04/16/2013 01:35 PM, Daniel P. Berrange wrote:
Can we just leave multi-domains out of this discussion for now. IMHO they are just causing more confusion to an already complicated problem. Lets focus on just solving the problem of PCI bridges which is what we actually need right now.
Sure. I actually find it highly unlikely that anyone will find a true need for multiple domains in a guest for a very long time. I summarized in a separate message yesterday what parts of all this are needed for basic pci-bridge support; the rest (sans multiple domains) will be needed for q35. As far as I know, everyone is now satisfied with the result, is that correct?

On Tue, Apr 16, 2013 at 12:05:40PM -0400, Laine Stump wrote:
On 04/15/2013 05:58 PM, Michael S. Tsirkin wrote:
On Mon, Apr 15, 2013 at 11:27:03AM -0600, Alex Williamson wrote:
On 04/11/2013 07:23 AM, Michael S. Tsirkin wrote:
On Thu, Apr 11, 2013 at 07:03:56AM -0400, Laine Stump wrote:
On 04/10/2013 05:26 AM, Daniel P. Berrange wrote: > On Tue, Apr 09, 2013 at 04:06:06PM -0400, Laine Stump wrote: >> On 04/09/2013 04:58 AM, Daniel P. Berrange wrote: >>> On Mon, Apr 08, 2013 at 03:32:07PM -0400, Laine Stump wrote: >>> Actually I do wonder if we should reprent a PCI root as two >>> <controller> elements, one representing the actual PCI root >>> device, and the other representing the host bridge that is >>> built-in. >>> >>> Also we should use the actual model names, not 'pci-root' or >>> 'pcie-root' but rather i440FX for "pc" machine type, and whatever >>> the q35 model name is. >>> >>> - One PCI root with built-in PCI bus (ie todays' setup) >>> >>> <controller type="pci-root" index="0"> >>> <model name="i440FX"/> >>> </controller> >>> <controller type="pci" index="0"> <!-- Host bridge --> >>> <address type='pci' domain='0' bus='0' slot='0''/> >> Isn't this saying that the bridge connects to itself? (since bus 0 is >> this bus) >> >> I understand (again, possibly wrongly) that the builtin PCI bus connects >> to the chipset using its own slot 0 (that's why it's reserved), but >> that's its address on itself. How is this bridge associated with the >> pci-root? >> >> Ah, I *think* I see it - the domain attribute of the pci controller is >> matched to the index of the pci-root controller, correct? But there's >> still something strange about the <address> of the pci controller being >> self-referential. > Yes, the index of the pci-root matches the 'domain' of <address> Okay, then the way that libvirt differentiates between a pci bridge that is connected to the root, and one that is connected to a slot of another bridge is 1) the "bus" attribute of the bridge's <address> matches the "index" attribute of the bridge itself, and 2) "slot" is always 0. Correct?
(The corollary of this is that if slot == 0 and bus != index, or bus == index and slot != 0, it is a configuration error).
I'm still unclear on the usefulness of the pci-root controller though - all the necessary information is contained in the pci controller, except for the type of root. But in the case of pcie root, I think you're not allowed to connect a standard bridge to it, only a "dmi-to-pci-bridge" (i82801b11-bridge) Yes you can connect a pci bridge to pcie-root. It's represented as a root complex integrated device. Is this accurate? Per the PCI express spec, any PCI express device needs to have a PCI express capability, which our pci-bridge does not. I think this is one of the main differences for our i82801b11-bridge,
On Fri, 2013-04-12 at 11:46 -0400, Laine Stump wrote: that it exposes itself as a root complex integrated endpoint, so we know it's effectively a PCIe-to-PCI bridge. If it does not have an express link upstream it's not a PCIe-to-PCI bridge, is it?
To my untrained ear it sounds like you're disagreeing with yourself ???
We'll be asking for trouble if/when we get guest IOMMU support if we are lax about using PCI-to-PCI bridges where we should have PCIe-to-PCI bridges. I recall the spec saying somewhere that integrated endpoints are outside the root complex hierarchy. I think IOMMU will simply not apply to these.
Correct me if I'm wrong - I think libvirt can ignore this bit of debate other than to use its result to determine which devices are allowed to connect to which other devices, right?
Yes.
There are plenty of examples to the contrary of root complex integrated endpoints without an express capability, but that doesn't make it correct to the spec. Is there something in the spec explicitly forbidding this? I merely find: The PCI Express Capability structure is required for PCI Express device Functions. So if it's not an express device it does not have to have an express capability?
Maybe we should send an example dump to pci sig and ask them...
ARGHH!! Just when I think I'm starting to understand *something* about these devices...
(later edit: after some coaching on IRC, I *think* I've got a bit better handle on it.)
(But I guess not good enough :-P)
>>> </controller> >>> <interface type='direct'> >>> ... >>> <address type='pci' domain='0' bus='0' slot='3'/> >>> </controller> >>> >>> - One PCI root with built-in PCI bus and extra PCI bridge >>> >>> <controller type="pci-root" index="0"> >>> <model name="i440FX"/> >>> </controller> >>> <controller type="pci" index="0"> <!-- Host bridge --> >>> <address type='pci' domain='0' bus='0' slot='0'/> >>> </controller> >>> <controller type="pci" index="1"> <!-- Additional bridge --> >>> <address type='pci' domain='0' bus='0' slot='1'/> >>> </controller> >>> <interface type='direct'> >>> ... >>> <address type='pci' domain='0' bus='1' slot='3'/> >>> </controller> >>> >>> - One PCI root with built-in PCI bus, PCI-E bus and and extra PCI bridge >>> (ie possible q35 setup) >> Why would a q35 machine have an i440FX pci-root? > It shouldn't, that's a typo > >>> <controller type="pci-root" index="0"> >>> <model name="i440FX"/> >>> </controller> >>> <controller type="pci" index="0"> <!-- Host bridge --> >>> <address type='pci' domain='0' bus='0' slot='0'/> >>> </controller> >>> <controller type="pci" index="1"> <!-- Additional bridge --> >>> <address type='pci' domain='0' bus='0' slot='1'/> >>> </controller> >>> <controller type="pci" index="1"> <!-- Additional bridge --> >>> <address type='pci' domain='0' bus='0' slot='1'/> >>> </controller> >> I think you did a cut-paste here and intended to change something, but >> didn't - those two bridges are identical. > Yep, the slot should be 2 in the second one > >>> <interface type='direct'> >>> ... >>> <address type='pci' domain='0' bus='1' slot='3'/> >>> </controller> >>> >>> So if we later allowed for mutiple PCI roots, then we'd have something >>> like >>> >>> <controller type="pci-root" index="0"> >>> <model name="i440FX"/> >>> </controller> >>> <controller type="pci-root" index="1"> >>> <model name="i440FX"/> >>> </controller> >>> <controller type="pci" index="0"> <!-- Host bridge 1 --> >>> <address type='pci' domain='0' bus='0' slot='0''/> >>> </controller> >>> <controller type="pci" index="0"> <!-- Host bridge 2 --> >>> <address type='pci' domain='1' bus='0' slot='0''/> >>> </controller>
There is a problem here - within a given controller type, we will now have the possibility of multiple controllers with the same index - the differentiating attribute will be in the <address> subelement, which could create some awkwardness. Maybe instead this should be handled with a different model of pci controller, and we can add a "domain" attribute at the toplevel rather than specifying an <address>?
On real hardware, the platform can specify the _BBN (Base Bus Number = bus) and the _SEG (Segment = domain) of the host bridge. So perhaps you want something like:
<controller type="pci-host-bridge"> <model name="i440FX"/> <address type="pci-host-bridge-addr" domain='1' bus='0'/> </controller>
The <address> element is intended to specify where a device or controller is connected *to*, not what bus/domain it *provides*. I think you're intending for this to provide domain 1 bus 0, so according to existing convention, you would want that information in the <controller> element attributes (e.g. for all other controller types, the generic "index" attribute is used to indicate a bus number when such a thing is appropriate for that type of controller).
Anyway, I've simplified this a bit in my latest iteration - there are no separate "root" and "root bus" controllers, just a "pci-root" (for i440FX) or "pcie-root" (for q35), both of which provide a "pci" bus (I'm using the term loosely here), each with different restrictions about what can be connected.
Yes, we could specify segments, though it's not the same as a domain as linux guests define it (I assume this is what libvirt wants to call a domain): if memory serves a segment does not have to be a root based hierarchy, linux domains are all root based.
I'm not exactly sure of the meanings/implications of all those terms, but from the point of view of libvirt, as long as we can represent all possible connections between devices using the domain:bus:slot.function notation, I think it doesn't matter too much.
We are better off not specifying BBN for all buses I think -
How would you differentiate between the different buses without some sort of identifier?
it's intended for multi-root support for legacy OSes.
"index" is confusing to me.
index is being used just because that's been the convention for other controller types - when there are multiple controllers of the same type, each is given an index, and that's used in the "child" devices to indicate which of the parent controllers they connect to.
I'd prefer ID for bus not a number, I'm concerned users will assume it's bus number and get confused by a mismatch.
So you would rather that they were something like this?
<controller type='pci' bus='pci.0'>
  <model type='pci-root'/>
</controller>
<interface type='blah'>
  ...
  <address type='pci' domain='0' bus='pci.0' slot='0' function='0'/>
</interface>
The problem is that the use of numeric bus IDs is fairly deeply ingrained in libvirt; every existing libvirt guest config has device addresses specifying "bus='0'". Switching to an alphanumeric ID rather than a simple number would require extra care to maintain backward compatibility with all those existing configs and with previous versions of libvirt that might end up being the recipient of XML generated by a newer libvirt. Because of this, at the very least the pci.0 bus must be referred to as bus='0'; once we've done that, we might as well refer to them *all* numerically (even if names were allowed, I'm sure everybody would just call them '1', '2' (or at the very most "pci.1", "pci.2") etc. anyway).
>>> <interface type='direct'> <!-- NIC on host bridge 2 --> >>> ... >>> <address type='pci' domain='1' bus='0' slot='3'/> >>> </controller> >>> >>> >>> NB this means that 'index' values can be reused against the >>> <controller>, provided they are setup on different pci-roots. >>> >>>> (also note that it might happen that the bus number in libvirt's config >>>> will correspond to the bus numbering that shows up in the guest OS, but >>>> that will just be a happy coincidence) >>>> >>>> Does this make sense? >>> Yep, I think we're fairly close. >> What about the other types of pci controllers that are used by PCIe? We >> should make sure they fit in this model before we settle on it. > What do they do ? (The descriptions of different models below tell what each of these other devices does; in short, they're all just some sort of electronic Lego to help connect PCI and PCIe devices into a tree).
Okay, I'll make yet another attempt at understanding these devices, and suggesting how they can all be described in the XML. I'm thinking that *all* of the express hubs, switch ports, bridges, etc can be described in xml in the manner above, i.e.:
<controller type='pci' index='n'>
  <model type='xxx'/>
</controller>
and that the method for connecting a device to any of them would be by specifying:
<address type='pci' domain='n' bus='n' slot='n' function='n'/>
Any limitations about which devices/controllers can connect to which controllers, and how many devices can connect to any particular controller will be derived from the <model type='xxx'/>. (And, as we've said before, although qemu doesn't assign each of these controllers a numeric bus id, and although we can make no guarantee that the bus id we use for a particular controller is what will be used by the guest BIOS/OS, it's still a convenient notation and works well with other hypervisors as well as qemu. I'll also note that when I run lspci on an X58-based machine I have here, *all* of the relationships between all the devices listed below are described with simple bus:slot.function numbers.)
Here is a list of the pci controller model types and their restrictions (thanks to mst and aw for repeating these over and over to me; I'm sure I still have made mistakes, but at least it's getting closer).
<controller type='pci-root'> ============================
Upstream: nothing Downstream: only a single pci-root-bus (implied) qemu commandline: nothing (it's implied in the q35 machinetype)
Explanation:
Each machine will have a different controller called "pci-root" as outlined above by Daniel. Two types of pci-root will be supported: i440FX and q35. If a pci-root is not spelled out in the config, one will be auto-added (depending on machinetype).
An i440FX pci-root has an implicitly added pci-bridge at 0:0:0.0 (and any bridge that has an address of slot='0' on its own bus is, by definition, connected to a pci-root controller - the two are matched by setting "domain" in the address of the pci-bridge to "index" of the pci-root). This bridge can only have PCI devices added.
A q35 pci-root also implies a different kind of pci-bridge device - one that can only have PCIe devices/controllers attached, but is otherwise identical to the pci-bridge added for i440FX. This bus will be called "root-bus" (Note that there are generally followed conventions for what can be connected to which slot on this bus, and we will probably follow those conventions when building a machine, *but* we will not hardcode this convention into libvirt; each q35 machine will be an empty slate)
<controller type='pci'> =======================
This will be used for *all* of the following controller devices supported by qemu:
<model type='pcie-root-bus'/> (implicit/integrated) ----------------------------
Upstream: connect to pci-root controller *only* Downstream: 32 slots, PCIe devices only, no hotplug. qemu commandline: nothing (implicit in the q35-* machinetype)
This controller is the bus described above that connects to a q35's pci-root, and provides places for PCIe devices to connect. Examples are root-ports, dmi-to-pci-bridges, SATA controllers, and integrated sound/usb/ethernet devices (do any of those that can be connected to the pcie-root-bus exist yet?).
There is only one of these controllers, and it will *always* be index='0', and will always have the following address:
<address type='pci' domain='0' bus='0' slot='0' function='0'/>

Implicit devices make me nervous, why wouldn't this just be a pcie-root (or pcie-host-bridge)? If we want to support multiple host bridges, there can certainly be more than one, so the index='0' assumption seems to fall apart.
That's when we need to start talking about a "domain" attribute, like this:
<controller type='pci' domain='1' index='0'>
  <model type='pcie-root-bus'/>
</controller>
<model type='root-port'/> (ioh3420) -------------------------
Upstream: PCIe, connect to pcie-root-bus *only* (?) yes
Downstream: 1 slot, PCIe devices only (?) yes
qemu commandline: -device ioh3420,...
These can only connect to the "pcie-root-bus" of a q35 (implying that this bus will need to have a different model name than the simple "pci-bridge").
<model type='dmi-to-pci-bridge'/> (i82801b11-bridge)
I'm worried this name is either too specific or too generic. What happens when we add a generic pcie-bridge and want to use that instead of the i82801b11-bridge? The guest really only sees this as a PCIe-to-PCI bridge, it just happens that on q35 this attaches at the DMI port of the MCH.
Hehe. Just using the name you (Alex) suggested :-)
My use of the "generic" device *type* names rather than exact hardware model names is based on the idea that any given machinetype will have a set of these "building block" devices available, and as long as you use everything from the same "set" on a given machine, it doesn't really matter which set you use. Is this a valid assumption?
---------------------------------
(btw, what does "dmi" mean?) http://en.wikipedia.org/wiki/Direct_Media_Interface
Upstream: pcie-root-bus *only* And only to a specific q35 slot (1e.0) for the i82801b11-bridge.
Downstream: 32 slots, any PCI device, no hotplug (?) Yet, but I think this is where we want to implement ACPI based hotplug.
Okay, but for now libvirt can just refrain from auto-addressing any user-created devices to that bus; we'll just make sure that there is always a "pci-bridge" plugged into it, and auto-addressed devices will all be put there.
In the meantime if someone explicitly addresses a device to connect to the i82801b11-bridge, we'll let them do it, but if they try to hot-unplug it they will get an error.
qemu commandline: -device i82801b11-bridge,...
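To make that concrete, the kind of layout being described might look something like this in the XML (purely a sketch - the model names and the <controller type='pci'> syntax are exactly what's still being hashed out in this thread, and the index/bus/slot values are only illustrative; slot 0x1e follows the q35 convention mentioned above):

  <controller type='pci' index='1'>
    <model type='dmi-to-pci-bridge'/>
    <address type='pci' domain='0' bus='0' slot='0x1e' function='0'/>
  </controller>
  <controller type='pci' index='2'>
    <model type='pci-bridge'/>
    <address type='pci' domain='0' bus='1' slot='1' function='0'/>
  </controller>

i.e. the dmi-to-pci-bridge hangs off pcie.0, a pci-bridge hangs off the dmi-to-pci-bridge, and any auto-addressed PCI devices would then land on bus 2 (the pci-bridge), never directly on bus 1.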
<model type='upstream-switch-port'/> (x3130-upstream)
------------------------------------
Upstream: PCIe, connect to pcie-root-bus, root-port, or downstream-switch-port (?) yes
Downstream: 32 slots, connect *only* to downstream-switch-port

I can't verify that there are 32 slots, mst? I've only set up downstream ports within slot 0.
According to a discussion with Don Dutile on IRC yesterday, the downstream side of an upstream-switch-port has 32 "slots" with 8 "functions" each, and each of these functions can have a downstream-switch-port connected. That said, he told me that in every case he's seen in the real world, all the downstream-switch-ports were connected to "function 0", effectively limiting it to 32 downstreams/upstream.
qemu-commandline: -device x3130-upstream
This is the upper side of a switch that can multiplex multiple devices onto a single port. It's only useful when one or more downstream switch ports are connected to it.
<model type='downstream-switch-port'/> (xio3130-downstream)
--------------------------------------
Upstream: connect *only* to upstream-switch-port
Downstream: 1 slot, any PCIe device
qemu commandline: -device xio3130-downstream
You can connect one or more of these to an upstream-switch-port in order to effectively plug multiple devices into a single PCIe port.
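As an illustration of how these two models combine (again only a sketch using the tentative model names from this proposal; all index and address values are made up), multiplexing two PCIe devices onto a single root-port might look like:

  <controller type='pci' index='1'>
    <model type='root-port'/>
    <address type='pci' domain='0' bus='0' slot='2' function='0'/>
  </controller>
  <controller type='pci' index='2'>
    <model type='upstream-switch-port'/>
    <address type='pci' domain='0' bus='1' slot='0' function='0'/>
  </controller>
  <controller type='pci' index='3'>
    <model type='downstream-switch-port'/>
    <address type='pci' domain='0' bus='2' slot='0' function='0'/>
  </controller>
  <controller type='pci' index='4'>
    <model type='downstream-switch-port'/>
    <address type='pci' domain='0' bus='2' slot='1' function='0'/>
  </controller>

with PCIe devices then addressed to bus='3' and bus='4', one behind each downstream port.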
<model type='pci-bridge'/> (pci-bridge)
--------------------------
Upstream: PCI, connect to 1) pci-root, 2) dmi-to-pci-bridge, 3) another pci-bridge
Downstream: any PCI device, 32 slots
qemu commandline: -device pci-bridge,...
This differs from dmi-to-pci-bridge in that its upstream connection is PCI rather than PCIe (so it will work on an i440FX system, which has no root PCIe bus) and that hotplug is supported. In general, if a guest will have any PCI devices, one of these controllers should be added, and auto-addressed PCI devices will be placed on it.
===============================================================
Comment: I'm not quite convinced that we really need the separate "pci-root" device. Since 1) every pci-root will *always* have either a pcie-root-bus or a pci-bridge connected to it, 2) the pci-root-bus will only ever be connected to the pci-root, and 3) the pci-bridge that connects to it will need special handling within the pci-bridge case anyway, why not:
1) eliminate the separate pci-root controller type
2) within <controller type='pci'>, a new <model type='pci-root-bus'/> will be added.
3) a pcie-root-bus will automatically be added for q35 machinetypes, and pci-root-bus for any machinetype that supports a PCI bus (e.g. "pc-*")
4) model type='pci-root-bus' will behave like pci-bridge, except that it will be an implicit device (nothing on qemu commandline) and it won't need an <address> element (neither will pcie-root-bus).

I think they should both have a domain + bus address to make it possible to build multi-domain/multi-host bridge systems. They do not use any slots though.
Yes. I think I agree with that. But we don't have to implement the multiple-domain stuff today (since qemu doesn't support it yet), and when we do, I think we can just add a "domain" attribute to the main element of pci-root and pcie-root controllers.
5) to support multiple domains, we can simply add a "domain" attribute to the toplevel of controller.
Or would this even be necessary if we supported a 'pci-root-addr' address type for the above, with the default being domain=0, bus=0? I suppose it doesn't matter whether it's a separate attribute or new address type though. Thanks,
I think you're mixing up the purpose of the <address> element vs the "index" attribute in the main <controller> element. To clarify, take this example:
<controller type='pci' index='3'>
  <model type='pci-bridge'/>
  <address domain='0' bus='1' slot='9' function='0'/>
</controller>
This controller is connected to slot 9 of the already-existing bus 1. It provides a bus 3 for other devices to connect to. If we wanted to start up a domain 1, we would do something like this:
<controller type='pci' domain='1' index='0'>
  <model type='pci-root'/>
</controller>
This would give us a PCI bus 0 in domain 1. You could then connect a pci-bridge to it like this:
<controller type='pci' domain='1' index='1'>
  <model type='pci-bridge'/>
  <address type='pci' domain='1' bus='0' slot='1' function='0'/>
</controller>
The <address> tells us that this new bus connects to slot 1 of PCI bus 0 in domain 1. The <controller domain='1' index='1'> tells us that there is now a new bus other devices can connect to that is at domain='1' bus='1'.
Also AFAIK there's nothing in the spec that requires bus=0 to be root. The _BBN hack above is used sometimes to give !=0 bus numbers to roots.
I don't really understand that, but do you think that 1) qemu would ever want/be able to model that, or that 2) anyone would ever have a practical reason for wanting to? It's really cool and all to be able to replicate any possible esoteric hardware configuration in a virtual machine, but it seems like the only practical use of replicating something like that would be for someone wanting to test what their OS does when there's no domain=0 in the hardware...

On Mon, 2013-04-08 at 12:37 -0400, Laine Stump wrote:
On 04/05/2013 03:26 PM, Alex Williamson wrote:
On Fri, 2013-04-05 at 14:42 -0400, Laine Stump wrote:
On 04/05/2013 01:38 PM, Daniel P. Berrange wrote:
On Fri, Apr 05, 2013 at 12:32:04PM -0400, Laine Stump wrote:
On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
add a new controller type, then one can define a pci-bridge controller like this:

  <controller type='pci-bridge' index='0'/>

In the next patch we're prohibiting exactly this config (index='0') because the pre-existing pci bus on the "pc-*" machinetypes is already named pci.0. If we don't allow it, we shouldn't include it as an example in the commit log :-)

NB, it isn't always named 'pci.0' - on many arches it is merely 'pci'.

Yeah, I'm just using that as a convenient shorthand. The final decision on whether to use pci.0 or pci happens down in qemuBuildCommandline().
More on this - one of the things this points out is that there is no representation in the config of the pci.0 bus, it's just assumed to always be there. That is the case for pc-* machinetypes (and probably several others with PCI buses), but for q35, there is no pci.0 bus in the basic machine, only a pcie.0; if you want a pci.0 on q35 (which *will* be necessary in order to attach any pci devices, so I imagine we will always want one), you have to attach a pcie->pci bridge, which is the device "i82801b11-bridge", to pcie.0. The reason I bring this up here is that I'm wondering:
1) should we have some representation of the default pci.0 bus in the config, even though it is just "always there" for the pc machinetypes and there is no way to disable it, and nothing on the commandline that specifies its existence?

Yep, we should be aiming for the XML to fully describe the machine hardware. So since we're adding the concept of PCI controllers/bridges etc to the XML, we should be auto-adding the default bus to the XML.
2) For the q35 machinetype, should we just always add an i82801b11-bridge device and name it pci.0? Or should that need to be present in the xml?

We've been burnt before auto-adding stuff that ought to have been optional. So I'd tend towards only having the minimal config that is required. If the users want this, let them explicitly ask for the bridge.
Okay. This makes for a larger burden on the user/virt-manager/boxes/libvirt-designer, but does prevent us from setting up an undesirable default that we can't rescue ourselves from :-)
Also from the apps POV the QEMU device name is irrelevant. The XML config works off the PCI addresses. So there's no need to force/specialcase a i82801b11-bridge to use the name 'pci.0'.
Sure. I just mean "pci bus 0" (hmm, but actually this does point out a problem with my logic - the same namespace (well, "numbering space") is used for both pcie and pci buses, so on a q35 system, bus=0 is already taken by pcie.0; that means that the first pci bus would need to use a different bus number anyway, so it wouldn't be so easy to switch an existing domain from pc to q35 - every PCI device would need to have its bus number modified. I suppose that's reasonable to expect, though.)

I would think you'd want to differentiate PCI from PCIe anyway. PCI is a bus and you have 32 slots per bus to fill. PCIe is a point-to-point link and you really only have slot 0 available. Perhaps that puts them in different number spaces already.
Are you saying that it's okay to have a bus=0 for pci and a different bus=0 for pcie?
In bus=<identifier> the identifier needs to be unique, but it's not a bus #, it's just an identifier.
I was hoping that what is used in libvirt's config could mirror as closely as possible the numbering that you see in the output of lspci on the guest, but it sounds like that numbering is something done at the whim of the guest, with no basis in (standard) reality, is that right?
Correct, the BIOS determines the initial bus numbers and it can do it however it wants. Most guests won't renumber buses, but they can if they want. It's a lost cause to expect any correlation between the libvirt bus identifier and the actual bus number.
3) Most important - depending on the answers to (1) and (2), should we maybe name this device "pci", and use a different backend depending on index and machinetype? (or alternately explicitly specifiable with a <driver> subelement). To be specific, we would have:
<controller type='pci' index='0'/>
which on pc machinetypes would just be a placeholder in the config (and always inserted if it wasn't there, for machinetypes that have a pci bus). On the q35 machinetype, that same line would equate to adding an i82801b11-bridge device (with source defaulting to bus=pcie.0,addr=1e.0). This would serve several purposes:
a) on pc machinetypes, it would be a visual aid indicating that pci.0 exists, and that index='0' isn't available for a new pci controller.
b) it would make switching a domain config from pc to q35 simpler, since pci.0 would always already be in place for attaching pci devices (including pci.1, pci.2, etc)
c) it would make the config a true complete description of the machine being created.
(I've suggested naming the controller "pci" rather than "pci-bridge" because in the case of a "root" bus like pci.0 it seems to not be a "bridge", but maybe the name "pci-bridge" is always appropriate, even when it's a root bus. Maybe someone with better pci/pcie knowledge can provide an opinion on this)

I think "pci" is a little too generic - how about we call it 'pci-root'?

Okay, so a separate "pci-root" device along with "pci-bridge"? What I was really hoping was to have all PCI buses represented in a common way in the config. How about a controller called "pci" with different types, "root" and "bridge"? And since they use the same numbering space as pcie buses, maybe the pcie controllers (including the root and the hubs and ???) would be different types of PCI controllers. That would make it easier (i.e. *possible*) to avoid collisions in use of bus numbers.
Alex or mst, any advice/opinions on how to represent all the different q35 devices that consume bus numbers in a succinct fashion?

Note that none of these are really bus numbers, they're just bus identifiers. The BIOS and the running guest define the bus numbers. "root" also has special meaning in PCI, so for instance I wouldn't name a bus behind the i82801b11-bridge "pci-root". Somehow we also need to deal with what can be attached where. For instance a pci-bridge is a PCI device and can only go on a PCI bus. The equivalent structure on PCIe is an upstream switch port with some number of downstream switch ports. Each of those are specific to the bus type.
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'>
  ...
  <address type='pci' domain='0' bus='0' slot='3' function='0'/>
  ...
</interface>
The above is not really how QEMU works. QEMU PCI devices take an addr= parameter that specifies the "slot.function". The bus= option is not numeric. That's the identifier value. So if you create a bus with:

  -device i82801b11-bridge,id=dmi-to-pci-bridge,addr=1e.0

Then to put a device on that bus you'd do:

  -device e1000,id=e1000-net-0,bus=dmi-to-pci-bridge,addr=0.0

We don't have a way to generate new domains yet, but I imagine it would require a PCI host bridge device and be a parameter to that. For instance:

  -device pci-host-bridge,domain=1,id=domain1-pcie.0

Which would create a new "pci-root". You would then specify a device in the domain using the same bus= notation. An e1000 on bus=pcie.0 is in domain0, but a nic on bus=domain1-pcie.0 is in domain1. This is all just speculation though.
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
Again, bus= is an identifier and I would guess that it will implicitly specify the domain when we get that far. Libvirt specifying a numerical domain:bus:slot.fn and expecting the device to appear there to the guest is a flawed concept.
* Now we are adding the ability to define new PCI buses, for now just a single kind - a pci-bridge controller, which itself must connect to an existing PCI slot, and provides 32 new PCI slots. But in the future there will be more different types of controllers that provide one or more PCI slots where PCI devices/controllers can be plugged in.
* In these patches adding support for pci-bridge, we are making the assumption that there is a 1:1 correspondence between the "index='n'" attribute of the pci-bridge controller and the "bus='n'" attribute of the <address> element in devices that will be plugged into that controller. So for example if we have:
<controller type='pci-bridge' index='1'>
  <address type='pci' domain='0' bus='0' slot='10' function='0'/>
</controller>
and then change the <interface> definition above to say "bus='1'", that interface device will plug into this new bus at slot 3.
Yes, you can do this, but there's no guarantee that the guest won't see that as bus number 7. '1' is just the name of the bus libvirt is using. It could also be named 'foo'.
* So let's assume that we add a new controller called "dmi-to-pci-bridge":
<controller type='dmi-to-pci-bridge' index='0'/>
Ignoring for now the question of what address we give in the definition of *this* device (which is itself problematic - do we need a new "pcie" address type?), if some device is then defined with
<address type='pci' bus='0' .../>
How do we differentiate between that meaning "the pci-ptp controller that is index='0'" and "the pci-bridge controller that is index='0'"? Do we need to expand our <address> element further? If, as I think you suggest, we have multiple different kinds of controllers that provide PCI slots, each with its own namespace, the current pci address element is inadequate to unambiguously describe where a pci device should be plugged in.
Perhaps we should be referencing the "<alias name='nnn'/>" element of each controller in the pci address of the target device, e.g.:
<controller type='pci-bridge' index='0'>
  <alias name='pci.0'/> <!-- obviously on a machine with no builtin pci.0! -->
</controller>
<controller type='dmi-to-pci-bridge' index='0'>
  <alias name='dmi-to-pci-bridge.0'/>
</controller>

<interface type='direct'>
  ...
  <address type='pci' controller='dmi-to-pci-bridge.0' slot='3' function='0'/>
</interface>
(or, since this "controller" attribute really obsoletes the numeric "bus" attribute, maybe it could be "bus='dmi-to-pci-bridge.0'", and we could continue to support "bus='0'" for legacy configs).
Yes, exactly. The id= of the controller becomes the bus= identifier for interfaces on that bus.
I believe right now the alias name is always auto-generated; we would need to make that so that when explicitly provided it would be guaranteed to never change (and if that's not possible to do in a backward compatible manner, then we need to come up with some new attribute to use in this manner)
Alternately, we could add new types to address, one for each new type of controller, then define the devices like this:
<interface type='direct'>
  <address type='pci-bridge' bus='0' slot='3' function='0'/>
</interface>

<interface type='direct'>
  <address type='dmi-to-pci-bridge' bus='0' slot='3' function='0'/>
</interface>
(yes, I know you wouldn't want to plug a network device into the dmi-to-pci-bridge directly, this is just for the sake of example)
You'll notice that this makes the bus attribute obsolete.
I'm not sure how you get multiple devices on the same bus using this model.
(side note: I know that this discussion has gone far beyond just talking about adding a single new type of controller (pci-bridge), but how we do this device will have implications far beyond, so we need to figure it out now.)
For PCIe, we create new buses for root ports (ioh3420), upstream switch ports (xio3130-upstream), downstream switch ports (xio3130-downstream), and the dmi-to-pci bridge (i82801b11-bridge). For PCI, PCI-to-PCI bridges create new buses (pci-bridge and dec-21154).
One of my goals is to move us away from emulation of specific chips and create more devices like pci-bridge that adhere to the standard, but don't try to emulate a specific device. Then we might have "root-port", "pcie-upstream-switch-port", "pcie-downstream-switch-port", and "dmi-to-pci-bridge" (none of these names have been discussed).
That makes sense to me at the level of libvirt, but in qemu don't you need to "emulate specific devices" anyway, in order for the guest OS to operate properly? If that's the case and there are different chips that implement the same functionality in a different manner, how would you decide which of those should be chosen as *the only* dmi-to-pci-bridge?
The "pci-bridge" is an example of a generic device. We've created our our virtual hardware that adheres to the necessary specifications but doesn't emulate a specific piece of physical hardware. Root bridges, switch ports, PCIe-to-PCI bridges, in fact even chipsets can be done the same way. These interconnect components typically use generic class drivers in the guest. If we go the next step and emulate specific devices then we also need to implement all the hardware bugs and limitations for that device as well as all the value add extensions for the device that may or may not add value on a virtual platform. For instance, why should our root ports be limited in width or speed to the same as the physical hardware? The dmi-to-pci bridge might just end up being a pcie-to-pci bridge, the dmi bus isn't really visible to the guest anyway (but on q35 we need to install it at a specific location because we're emulating specific hardware). Thanks, Alex

On Mon, Apr 08, 2013 at 04:26:19PM -0600, Alex Williamson wrote:
On Mon, 2013-04-08 at 12:37 -0400, Laine Stump wrote:
On 04/05/2013 03:26 PM, Alex Williamson wrote:
Sure. I just mean "pci bus 0" (hmm, but actually this does point out a problem with my logic - the same namespace (well, "numbering space") is used for both pcie and pci buses, so on a q35 system, bus=0 is already taken by pcie.0; that means that the first pci bus would need to use a different bus number anyway, so it wouldn't be so easy to switch an existing domain from pc to q35 - every PCI device would need to have its bus number modified. I suppose that's reasonable to expect, though. I would think you'd want to differentiate PCI from PCIe anyway. PCI is a bus and you have 32 slots per bus to fill. PCIe is a point-to-point link and you really only have slot 0 available. Perhaps that puts them in different number spaces already.
Are you saying that it's okay to have a bus=0 for pci and a different bus=0 for pcie?
In bus=<identifier> the identifier needs to be unique, but it's not a bus #, it's just an identifier.
This is mixing up QEMU syntax with libvirt syntax - we're not requiring them to be the same. Libvirt operates solely in terms of bus numbers, and will turn those into QEMU bus identifiers at the time it generates the CLI args. Indeed we explicitly do *not* want to follow QEMU's syntax in the libvirt XML, since this is no good for non-QEMU hypervisors.
I was hoping that what is used in libvirt's config could mirror as closely as possible the numbering that you see in the output of lspci on the guest, but it sounds like that numbering is something done at the whim of the guest, with no basis in (standard) reality, is that right?
Correct, the BIOS determines the initial bus numbers and it can do it however it wants. Most guests won't renumber buses, but they can if they want. It's a lost cause to expect any correlation between the libvirt bus identifier and the actual bus number.
I don't think libvirt needs to require that its XML bus numbers match the guest OS' view of bus numbers. If they do match, great, otherwise so be it.
I think we're starting to get closer to the concrete problem that's bothering me. As I understand it (and again - "what I understand" has repeatedly been shown to be incorrect in this thread :-):
* There are multiple different types of devices that provide a bus with 1 or more "slots" that PCI devices (e.g., the virtio-net-pci device, the e1000 network device, etc) can be plugged into.
* In the config for those devices, there is a required (auto-generated if not explicitly provided) <address> element that indicates what controller that device is plugged into e.g.:
<interface type='direct'>
  ...
  <address type='pci' domain='0' bus='0' slot='3' function='0'/>
  ...
</interface>
The above is not really how QEMU works. QEMU PCI devices take an addr= parameter that specifies the "slot.function". The bus= option is not numeric. That's the identifier value. So if you create a bus with:
-device i82801b11-bridge,id=dmi-to-pci-bridge,addr=1e.0
Then to put a device on that bus you'd do:
-device e1000,id=e1000-net-0,bus=dmi-to-pci-bridge,addr=0.0
As above, that's no problem. Libvirt is perfectly capable of translating from <address type='pci' domain='0' bus='5' slot='3' function='0'/> to to whatever bus=AAA and addr=BBB args that are appropriate for QEMU.
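For example (hypothetical values, assuming the controller providing bus 2 ends up with the QEMU id "pci.2", as the patches in this series do with their id=pci.N naming), a device configured as

  <address type='pci' domain='0' bus='2' slot='3' function='0'/>

would be translated by libvirt into something like ",bus=pci.2,addr=0x3" on that device's -device argument - the numeric bus from the XML never reaches QEMU directly.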
We don't have a way to generate new domains yet, but I imagine it would require a PCI host bridge device and be a parameter to that. For instance:
-device pci-host-bridge,domain=1,id=domain1-pcie.0
Which would create a new "pci-root". You would then specify a device in the domain using the same bus= notation. An e1000 on bus=pcie.0 is in domain0, but a nic on bus=domain1-pcie.0 is in domain1. This is all just speculation though.
* domain is always hardcoded to 0, and in the past bus was also always hardcoded to 0 because until now there has only been a single place where PCI devices could be connected - the builtin pci.0 bus, which is a part of the basic "pc" (and some others) virtual machine and provides 32 slots.
Again, bus= is an identifier and I would guess that it will implicitly specify the domain when we get that far. Libvirt specifying a numerical domain:bus:slot.fn and expecting the device to appear there to the guest is a flawed concept.
No, libvirt won't be using addr=domain:bus:slot.fun. It will use whatever bus=AAA and addr=slot.fun pairing is required for QEMU.

Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|

On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
add a new controller type, then one can define a pci-bridge controller like this: <controller type='pci-bridge' index='0'/> <controller type='pci-bridge' index='1'> <address type='pci' domain='0x0000' bus='0x00' slot='0x05' function='0x0'/> </controller> actually, it works as a pci-bus, so as to support multi-pci-bus via pci-to-pci bridge
Okay. I think the result of all the discussion started by this patch is that, for the current functionality, we need to do what is in the 4 steps at the bottom of this message:
https://www.redhat.com/archives/libvir-list/2013-April/msg01144.html

1) There will be a new <controller type='pci'> device, with a <model type='xyz'/> subelement. Initially we will support types "pci-root" and "pci-bridge" (all the other types discussed above can be added later). pci-root will have *no <address>* element (and will generate nothing on the qemu commandline, but will create a 32 slot "bus='0'" to plug PCI devices into). pci-bridge will have an <address> element, will generate a -device option on the qemu commandline, and will also create a 32 slot "bus='n'" to plug PCI devices into.

2) for machinetypes that have a PCI bus, the config should have this controller auto-added:

  <controller type='pci'>
    <model type='pci-root'/>
  </controller>

This will make bus='0' available (but add nothing to the qemu commandline). Any attempt to add a PCI device when there is no bus available should be an error.

3) The way to add more buses will be to add a controller like this:

  <controller type='pci'>
    <model type='pci-bridge'/>
  </controller>

4) When <controller type='usb'> was added, resulting in auto-generated devices, that caused problems when migrating from a host with newer libvirt to one with older libvirt. We need to make sure we don't suffer the same problem this time. See the following two BZes for details (unless you have a better memory than me! :-):

https://bugzilla.redhat.com/show_bug.cgi?id=815503
https://bugzilla.redhat.com/show_bug.cgi?id=856864
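Putting steps 1) - 3) together, a minimal config using the new controller might look something like this (only a sketch; the attribute values are illustrative):

  <controller type='pci' index='0'>
    <model type='pci-root'/>
  </controller>
  <controller type='pci' index='1'>
    <model type='pci-bridge'/>
    <address type='pci' domain='0' bus='0' slot='5' function='0'/>
  </controller>
  <interface type='network'>
    <source network='default'/>
    <address type='pci' domain='0' bus='1' slot='3' function='0'/>
  </interface>

Here the pci-root would contribute nothing to the qemu commandline, the pci-bridge would turn into roughly "-device pci-bridge,chassis_nr=1,id=pci.1,bus=pci.0,addr=0x5" (following the id=pci.N naming used in this series), and the interface would then be placed on the new bus with "bus=pci.1,addr=0x3".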

From: liguang <lig.fnst@cn.fujitsu.com> --- src/qemu/qemu_capabilities.c | 2 ++ src/qemu/qemu_capabilities.h | 1 + src/qemu/qemu_command.c | 15 ++++++++++++++- tests/qemuhelptest.c | 21 ++++++++++++++------- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index aa381b4..4377e08 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -216,6 +216,7 @@ VIR_ENUM_IMPL(virQEMUCaps, QEMU_CAPS_LAST, "ipv6-migration", /* 135 */ "machine-opt", + "pci-bridge", ); struct _virQEMUCaps { @@ -1357,6 +1358,7 @@ struct virQEMUCapsStringFlags virQEMUCapsObjectTypes[] = { { "virtio-rng-ccw", QEMU_CAPS_DEVICE_VIRTIO_RNG }, { "rng-random", QEMU_CAPS_OBJECT_RNG_RANDOM }, { "rng-egd", QEMU_CAPS_OBJECT_RNG_EGD }, + { "pci-bridge", QEMU_CAPS_DEVICE_PCI_BRIDGE }, }; static struct virQEMUCapsStringFlags virQEMUCapsObjectPropsVirtioBlk[] = { diff --git a/src/qemu/qemu_capabilities.h b/src/qemu/qemu_capabilities.h index b2dc588..e3bba52 100644 --- a/src/qemu/qemu_capabilities.h +++ b/src/qemu/qemu_capabilities.h @@ -176,6 +176,7 @@ enum virQEMUCapsFlags { QEMU_CAPS_SCSI_MEGASAS = 134, /* -device megasas */ QEMU_CAPS_IPV6_MIGRATION = 135, /* -incoming [::] */ QEMU_CAPS_MACHINE_OPT = 136, /* -machine xxxx*/ + QEMU_CAPS_DEVICE_PCI_BRIDGE = 137, /* -device pci-bridge */ QEMU_CAPS_LAST, /* this must always be the last item */ }; diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index e221c82..7817b13 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -1972,7 +1972,9 @@ qemuBuildDeviceAddressStr(virBufferPtr buf, * When QEMU grows support for > 1 PCI domain, then pci.0 change * to pciNN.0 where NN is the domain number */ - if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_MULTIBUS)) + if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_PCI_BRIDGE)) + virBufferAsprintf(buf, ",bus=pci.%u", info->addr.pci.bus); + else if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_MULTIBUS)) virBufferAsprintf(buf, ",bus=pci.0"); else virBufferAsprintf(buf, ",bus=pci"); @@ -3576,6 +3578,16 @@ qemuBuildControllerDevStr(virDomainDefPtr domainDef, break; + case VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE: + if (def->idx == 0) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("PCI bridge index should be > 0")); + goto error; + } + virBufferAsprintf(&buf, "pci-bridge,chassis_nr=%d,id=pci.%d", + def->idx, def->idx); + break; + /* We always get an IDE controller, whether we want it or not. */ case VIR_DOMAIN_CONTROLLER_TYPE_IDE: default: @@ -5674,6 +5686,7 @@ qemuBuildCommandLine(virConnectPtr conn, /* We don't add an explicit IDE or FD controller because the * provided PIIX4 device already includes one. It isn't possible to * remove the PIIX4. 
*/ + VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE, VIR_DOMAIN_CONTROLLER_TYPE_USB, VIR_DOMAIN_CONTROLLER_TYPE_SCSI, VIR_DOMAIN_CONTROLLER_TYPE_SATA, diff --git a/tests/qemuhelptest.c b/tests/qemuhelptest.c index 43774f4..f0181ce 100644 --- a/tests/qemuhelptest.c +++ b/tests/qemuhelptest.c @@ -397,7 +397,8 @@ mymain(void) QEMU_CAPS_DEVICE_CIRRUS_VGA, QEMU_CAPS_DEVICE_VMWARE_SVGA, QEMU_CAPS_DEVICE_USB_SERIAL, - QEMU_CAPS_DEVICE_USB_NET); + QEMU_CAPS_DEVICE_USB_NET, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-kvm-0.12.3", 12003, 1, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -506,7 +507,8 @@ mymain(void) QEMU_CAPS_DEVICE_CIRRUS_VGA, QEMU_CAPS_DEVICE_VMWARE_SVGA, QEMU_CAPS_DEVICE_USB_SERIAL, - QEMU_CAPS_DEVICE_USB_NET); + QEMU_CAPS_DEVICE_USB_NET, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-kvm-0.12.1.2-rhel61", 12001, 1, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -571,7 +573,8 @@ mymain(void) QEMU_CAPS_DEVICE_CIRRUS_VGA, QEMU_CAPS_DEVICE_VMWARE_SVGA, QEMU_CAPS_DEVICE_USB_SERIAL, - QEMU_CAPS_DEVICE_USB_NET); + QEMU_CAPS_DEVICE_USB_NET, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-kvm-0.12.1.2-rhel62-beta", 12001, 1, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -643,7 +646,8 @@ mymain(void) QEMU_CAPS_VNC, QEMU_CAPS_DEVICE_QXL, QEMU_CAPS_DEVICE_VGA, - QEMU_CAPS_DEVICE_CIRRUS_VGA); + QEMU_CAPS_DEVICE_CIRRUS_VGA, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-1.0", 1000000, 0, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -815,7 +819,8 @@ mymain(void) QEMU_CAPS_DEVICE_USB_SERIAL, QEMU_CAPS_DEVICE_USB_NET, QEMU_CAPS_DTB, - QEMU_CAPS_IPV6_MIGRATION); + QEMU_CAPS_IPV6_MIGRATION, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-1.2.0", 1002000, 0, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -918,7 +923,8 @@ mymain(void) QEMU_CAPS_DEVICE_USB_NET, QEMU_CAPS_DTB, QEMU_CAPS_SCSI_MEGASAS, - QEMU_CAPS_IPV6_MIGRATION); + QEMU_CAPS_IPV6_MIGRATION, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-kvm-1.2.0", 1002000, 1, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -1026,7 +1032,8 @@ mymain(void) QEMU_CAPS_DEVICE_USB_NET, QEMU_CAPS_DTB, QEMU_CAPS_SCSI_MEGASAS, - QEMU_CAPS_IPV6_MIGRATION); + QEMU_CAPS_IPV6_MIGRATION, + QEMU_CAPS_DEVICE_PCI_BRIDGE); return ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE; } -- 1.8.1.5

On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
--- src/qemu/qemu_capabilities.c | 2 ++ src/qemu/qemu_capabilities.h | 1 + src/qemu/qemu_command.c | 15 ++++++++++++++- tests/qemuhelptest.c | 21 ++++++++++++++------- 4 files changed, 31 insertions(+), 8 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index aa381b4..4377e08 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -216,6 +216,7 @@ VIR_ENUM_IMPL(virQEMUCaps, QEMU_CAPS_LAST,
"ipv6-migration", /* 135 */ "machine-opt", + "pci-bridge", );
struct _virQEMUCaps { @@ -1357,6 +1358,7 @@ struct virQEMUCapsStringFlags virQEMUCapsObjectTypes[] = { { "virtio-rng-ccw", QEMU_CAPS_DEVICE_VIRTIO_RNG }, { "rng-random", QEMU_CAPS_OBJECT_RNG_RANDOM }, { "rng-egd", QEMU_CAPS_OBJECT_RNG_EGD }, + { "pci-bridge", QEMU_CAPS_DEVICE_PCI_BRIDGE }, };
static struct virQEMUCapsStringFlags virQEMUCapsObjectPropsVirtioBlk[] = { diff --git a/src/qemu/qemu_capabilities.h b/src/qemu/qemu_capabilities.h index b2dc588..e3bba52 100644 --- a/src/qemu/qemu_capabilities.h +++ b/src/qemu/qemu_capabilities.h @@ -176,6 +176,7 @@ enum virQEMUCapsFlags { QEMU_CAPS_SCSI_MEGASAS = 134, /* -device megasas */ QEMU_CAPS_IPV6_MIGRATION = 135, /* -incoming [::] */ QEMU_CAPS_MACHINE_OPT = 136, /* -machine xxxx*/ + QEMU_CAPS_DEVICE_PCI_BRIDGE = 137, /* -device pci-bridge */
QEMU_CAPS_LAST, /* this must always be the last item */ }; diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index e221c82..7817b13 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -1972,7 +1972,9 @@ qemuBuildDeviceAddressStr(virBufferPtr buf, * When QEMU grows support for > 1 PCI domain, then pci.0 change * to pciNN.0 where NN is the domain number */ - if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_MULTIBUS)) + if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_PCI_BRIDGE)) + virBufferAsprintf(buf, ",bus=pci.%u", info->addr.pci.bus); + else if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_MULTIBUS)) virBufferAsprintf(buf, ",bus=pci.0"); else virBufferAsprintf(buf, ",bus=pci");
The above looks like it would just cover up a configuration problem - if info->addr.pci.bus is > 0, then QEMU_CAPS_DEVICE_PCI_BRIDGE had better be true, otherwise it's an error in the config (and I assume that if DEVICE_PCI_BRIDGE is true, then so is PCI_MULTIBUS, correct?). In this case we shouldn't just silently use bus=pci.0 or bus=pci, we should log an error:

    if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_PCI_BRIDGE)) {
        virBufferAsprintf(buf, ",bus=pci.%u", info->addr.pci.bus);
    } else {
        if (info->addr.pci.bus != 0) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("Only PCI device addresses with bus=0 "
                             "are supported with this QEMU binary"));
            return -1;
        }
        if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_PCI_MULTIBUS))
            virBufferAddLit(buf, ",bus=pci.0");
        else
            virBufferAddLit(buf, ",bus=pci");
    }
@@ -3576,6 +3578,16 @@ qemuBuildControllerDevStr(virDomainDefPtr domainDef,
break;
+    case VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE:
+        if (def->idx == 0) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("PCI bridge index should be > 0"));
+            goto error;
+        }
+        virBufferAsprintf(&buf, "pci-bridge,chassis_nr=%d,id=pci.%d",
+                          def->idx, def->idx);
+        break;
+
Depending on what others think about my comments in 4/7, when we see a bridge with index='0', instead of erroring out, we may want to:

1) ignore it if machinetype is pc-*
2) generate a commandline for an i82801b11-bridge device attached to pcie.0 if machinetype is q35-*
/* We always get an IDE controller, whether we want it or not. */ case VIR_DOMAIN_CONTROLLER_TYPE_IDE: default: @@ -5674,6 +5686,7 @@ qemuBuildCommandLine(virConnectPtr conn, /* We don't add an explicit IDE or FD controller because the * provided PIIX4 device already includes one. It isn't possible to * remove the PIIX4. */ + VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE, VIR_DOMAIN_CONTROLLER_TYPE_USB, VIR_DOMAIN_CONTROLLER_TYPE_SCSI, VIR_DOMAIN_CONTROLLER_TYPE_SATA, diff --git a/tests/qemuhelptest.c b/tests/qemuhelptest.c index 43774f4..f0181ce 100644 --- a/tests/qemuhelptest.c +++ b/tests/qemuhelptest.c @@ -397,7 +397,8 @@ mymain(void) QEMU_CAPS_DEVICE_CIRRUS_VGA, QEMU_CAPS_DEVICE_VMWARE_SVGA, QEMU_CAPS_DEVICE_USB_SERIAL, - QEMU_CAPS_DEVICE_USB_NET); + QEMU_CAPS_DEVICE_USB_NET, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-kvm-0.12.3", 12003, 1, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -506,7 +507,8 @@ mymain(void) QEMU_CAPS_DEVICE_CIRRUS_VGA, QEMU_CAPS_DEVICE_VMWARE_SVGA, QEMU_CAPS_DEVICE_USB_SERIAL, - QEMU_CAPS_DEVICE_USB_NET); + QEMU_CAPS_DEVICE_USB_NET, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-kvm-0.12.1.2-rhel61", 12001, 1, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -571,7 +573,8 @@ mymain(void) QEMU_CAPS_DEVICE_CIRRUS_VGA, QEMU_CAPS_DEVICE_VMWARE_SVGA, QEMU_CAPS_DEVICE_USB_SERIAL, - QEMU_CAPS_DEVICE_USB_NET); + QEMU_CAPS_DEVICE_USB_NET, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-kvm-0.12.1.2-rhel62-beta", 12001, 1, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -643,7 +646,8 @@ mymain(void) QEMU_CAPS_VNC, QEMU_CAPS_DEVICE_QXL, QEMU_CAPS_DEVICE_VGA, - QEMU_CAPS_DEVICE_CIRRUS_VGA); + QEMU_CAPS_DEVICE_CIRRUS_VGA, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-1.0", 1000000, 0, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -815,7 +819,8 @@ mymain(void) QEMU_CAPS_DEVICE_USB_SERIAL, QEMU_CAPS_DEVICE_USB_NET, QEMU_CAPS_DTB, - QEMU_CAPS_IPV6_MIGRATION); + QEMU_CAPS_IPV6_MIGRATION, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-1.2.0", 1002000, 0, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -918,7 +923,8 @@ mymain(void) QEMU_CAPS_DEVICE_USB_NET, QEMU_CAPS_DTB, QEMU_CAPS_SCSI_MEGASAS, - QEMU_CAPS_IPV6_MIGRATION); + QEMU_CAPS_IPV6_MIGRATION, + QEMU_CAPS_DEVICE_PCI_BRIDGE); DO_TEST("qemu-kvm-1.2.0", 1002000, 1, 0, QEMU_CAPS_VNC_COLON, QEMU_CAPS_NO_REBOOT, @@ -1026,7 +1032,8 @@ mymain(void) QEMU_CAPS_DEVICE_USB_NET, QEMU_CAPS_DTB, QEMU_CAPS_SCSI_MEGASAS, - QEMU_CAPS_IPV6_MIGRATION); + QEMU_CAPS_IPV6_MIGRATION, + QEMU_CAPS_DEVICE_PCI_BRIDGE);
return ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE; }
Otherwise seems fine.

On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
--- src/qemu/qemu_capabilities.c | 2 ++ src/qemu/qemu_capabilities.h | 1 + src/qemu/qemu_command.c | 15 ++++++++++++++- tests/qemuhelptest.c | 21 ++++++++++++++------- 4 files changed, 31 insertions(+), 8 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index aa381b4..4377e08 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -216,6 +216,7 @@ VIR_ENUM_IMPL(virQEMUCaps, QEMU_CAPS_LAST,
"ipv6-migration", /* 135 */ "machine-opt", + "pci-bridge", );
struct _virQEMUCaps { @@ -1357,6 +1358,7 @@ struct virQEMUCapsStringFlags virQEMUCapsObjectTypes[] = { { "virtio-rng-ccw", QEMU_CAPS_DEVICE_VIRTIO_RNG }, { "rng-random", QEMU_CAPS_OBJECT_RNG_RANDOM }, { "rng-egd", QEMU_CAPS_OBJECT_RNG_EGD }, + { "pci-bridge", QEMU_CAPS_DEVICE_PCI_BRIDGE }, };
static struct virQEMUCapsStringFlags virQEMUCapsObjectPropsVirtioBlk[] = { diff --git a/src/qemu/qemu_capabilities.h b/src/qemu/qemu_capabilities.h index b2dc588..e3bba52 100644 --- a/src/qemu/qemu_capabilities.h +++ b/src/qemu/qemu_capabilities.h @@ -176,6 +176,7 @@ enum virQEMUCapsFlags { QEMU_CAPS_SCSI_MEGASAS = 134, /* -device megasas */ QEMU_CAPS_IPV6_MIGRATION = 135, /* -incoming [::] */ QEMU_CAPS_MACHINE_OPT = 136, /* -machine xxxx*/ + QEMU_CAPS_DEVICE_PCI_BRIDGE = 137, /* -device pci-bridge */
QEMU_CAPS_LAST, /* this must always be the last item */ };
Another problem - I don't see anywhere that QEMU_CAPS_DEVICE_PCI_BRIDGE is being turned on.

On 04/17/2013 01:58 PM, Laine Stump wrote:
On 04/03/2013 11:50 AM, Ján Tomko wrote:
From: liguang <lig.fnst@cn.fujitsu.com>
--- src/qemu/qemu_capabilities.c | 2 ++ src/qemu/qemu_capabilities.h | 1 + src/qemu/qemu_command.c | 15 ++++++++++++++- tests/qemuhelptest.c | 21 ++++++++++++++------- 4 files changed, 31 insertions(+), 8 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c index aa381b4..4377e08 100644 --- a/src/qemu/qemu_capabilities.c +++ b/src/qemu/qemu_capabilities.c @@ -216,6 +216,7 @@ VIR_ENUM_IMPL(virQEMUCaps, QEMU_CAPS_LAST,
"ipv6-migration", /* 135 */ "machine-opt", + "pci-bridge", );
struct _virQEMUCaps { @@ -1357,6 +1358,7 @@ struct virQEMUCapsStringFlags virQEMUCapsObjectTypes[] = { { "virtio-rng-ccw", QEMU_CAPS_DEVICE_VIRTIO_RNG }, { "rng-random", QEMU_CAPS_OBJECT_RNG_RANDOM }, { "rng-egd", QEMU_CAPS_OBJECT_RNG_EGD }, + { "pci-bridge", QEMU_CAPS_DEVICE_PCI_BRIDGE }, };
static struct virQEMUCapsStringFlags virQEMUCapsObjectPropsVirtioBlk[] = { diff --git a/src/qemu/qemu_capabilities.h b/src/qemu/qemu_capabilities.h index b2dc588..e3bba52 100644 --- a/src/qemu/qemu_capabilities.h +++ b/src/qemu/qemu_capabilities.h @@ -176,6 +176,7 @@ enum virQEMUCapsFlags { QEMU_CAPS_SCSI_MEGASAS = 134, /* -device megasas */ QEMU_CAPS_IPV6_MIGRATION = 135, /* -incoming [::] */ QEMU_CAPS_MACHINE_OPT = 136, /* -machine xxxx*/ + QEMU_CAPS_DEVICE_PCI_BRIDGE = 137, /* -device pci-bridge */
QEMU_CAPS_LAST, /* this must always be the last item */ };

Another problem - I don't see anywhere that QEMU_CAPS_DEVICE_PCI_BRIDGE is being turned on.
Nevermind. Now I see how it works - the entry in virQEMUCapsObjectTypes[] causes a check, which will set it. *So* much simpler than the older help output parsing...

Allow specifying addresses with non-zero buses in the XML. Check that the bridge topology results in their indexes matching the PCI buses they provide. --- src/qemu/qemu_command.c | 207 +++++++++++++++++++++++++++++++++++++++++++++--- src/qemu/qemu_command.h | 3 +- 2 files changed, 196 insertions(+), 14 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 7817b13..7073844 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -1195,6 +1195,8 @@ cleanup: typedef uint8_t _qemuDomainPCIAddressBus[QEMU_PCI_ADDRESS_LAST_SLOT]; struct _qemuDomainPCIAddressSet { _qemuDomainPCIAddressBus *used; + size_t nbuses; /* allocation of used */ + unsigned int maxbus; /* maximum used bus number */ virDevicePCIAddress lastaddr; }; @@ -1203,7 +1205,7 @@ struct _qemuDomainPCIAddressSet { * Returns -1 if the address is unusable * 0 if it's OK. */ -static int qemuPCIAddressCheck(qemuDomainPCIAddressSetPtr addrs ATTRIBUTE_UNUSED, +static int qemuPCIAddressCheck(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr) { if (addr->domain != 0) { @@ -1211,9 +1213,10 @@ static int qemuPCIAddressCheck(qemuDomainPCIAddressSetPtr addrs ATTRIBUTE_UNUSED _("Only PCI domain 0 is available")); return -1; } - if (addr->bus != 0) { - virReportError(VIR_ERR_XML_ERROR, "%s", - _("Only PCI bus 0 is available")); + if (addr->bus >= addrs->nbuses) { + virReportError(VIR_ERR_XML_ERROR, _("Only PCI buses up to %u are" + " available"), + (unsigned int) addrs->nbuses - 1); return -1; } if (addr->function >= QEMU_PCI_ADDRESS_LAST_FUNCTION) { @@ -1228,9 +1231,46 @@ static int qemuPCIAddressCheck(qemuDomainPCIAddressSetPtr addrs ATTRIBUTE_UNUSED QEMU_PCI_ADDRESS_LAST_SLOT); return -1; } + if (addr->slot == 0) { + if (addr->bus) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Slot 0 is unusable on PCI bridges")); + return -1; + } else { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Slot 0 on bus 0 is reserved for the host bridge")); + return -1; + } + } + if (addr->bus > addrs->maxbus) + addrs->maxbus = addr->bus; return 0; } +/* grows the address set to fit addr in + * -1 = OOM + * 0 = no action required + * >0 = number of buses added + */ +static int qemuPCIAddressSetGrow(qemuDomainPCIAddressSetPtr addrs, + virDevicePCIAddressPtr addr) +{ + int add, i; + + add = addr->bus - addrs->nbuses + 1; + i = addrs->nbuses; + if (add <= 0) + return 0; + if (VIR_EXPAND_N(addrs->used, addrs->nbuses, add) < 0) { + virReportOOMError(); + return -1; + } + /* reserve slot 0 on the new buses */ + for (; i < addrs->nbuses; i++) + addrs->used[i][0] = 0xFF; + return add; +} + static char *qemuPCIAddressAsString(virDevicePCIAddressPtr addr) { @@ -1268,6 +1308,9 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, return 0; } + if (qemuPCIAddressSetGrow(addrs, addr) < 0) + return -1; + if (qemuPCIAddressCheck(addrs, addr) < 0) return -1; @@ -1311,6 +1354,124 @@ cleanup: } +typedef struct _qemuDomainPCIBridge qemuDomainPCIBridge; +struct _qemuDomainPCIBridge { + int idx; + int slot; +}; + + +/* Recursively check if PCI bridge indexes match numbers of the buses + * they provide. + * + * ptr: [nbuses][32] array of qemuDomainPCIBridges sorted by slot number + * bus: bus where to start checking + * start: the index the first bridge on that bus should have + * nbuses: number of buses in the ptr array + * + * Returns -1 if there is a mismatch + * The number of buses provided so far otherwise. 
+ */ +static int +qemuDomainVerifyPCIBridgesRecursive(qemuDomainPCIBridge **ptr, + unsigned int bus, + unsigned int start, + unsigned int nbuses) +{ + int i, idx; + int cur = start; + + if (bus >= nbuses) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("More bridges than buses")); + return -1; + } + for (i = 0; i < QEMU_PCI_ADDRESS_LAST_SLOT; i++) { + idx = ptr[bus][i].idx; + /* no more bridges on this bus? */ + if (!idx) + return cur; + if (idx != cur) { + virReportError(VIR_ERR_XML_ERROR, + _("PCI bridge index %d doesn't match" + " expected index %d"), idx, cur); + return -1; + } + cur++; + if ((cur = qemuDomainVerifyPCIBridgesRecursive(ptr, idx, cur, + nbuses)) < 0) + return -1; + } + return cur; +} + + +/* + * Verify PCI bridge topology + */ +static int +qemuDomainVerifyPCIBridges(virDomainDefPtr def, + unsigned int nbuses) +{ + qemuDomainPCIBridge **ptr; + int i, j, ret = -1; + size_t tmp = 0; + int rv; + + if (VIR_ALLOC_N(ptr, nbuses) < 0) { + virReportOOMError(); + return -1; + } + + for (i = 0; i < nbuses; i++) { + if (VIR_ALLOC_N(ptr[i], QEMU_PCI_ADDRESS_LAST_SLOT) < 0) { + virReportOOMError(); + goto cleanup; + } + } + + for (i = 0; i < def->ncontrollers; i++) { + /* go through all PCI bridges defined in the domain */ + virDomainControllerDefPtr cdef = def->controllers[i]; + if (cdef->type == VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE) { + unsigned int bus = cdef->info.addr.pci.bus; + unsigned int slot = cdef->info.addr.pci.slot; + qemuDomainPCIBridge br = { + .slot = slot, + .idx = cdef->idx + }; + + if (bus >= nbuses) { + virReportError(VIR_ERR_INTERNAL_ERROR, + "%s", + _("bridge is on a non-existent bus")); + goto cleanup; + } + + /* sort PCI bridges by slot number */ + for (j = 0; j < QEMU_PCI_ADDRESS_LAST_SLOT; j++) { + if (!ptr[bus][j].idx || ptr[bus][j].slot > slot) + break; + } + ignore_value(VIR_INSERT_ELEMENT_INPLACE(ptr[bus], j, tmp, br)); + } + } + + rv = qemuDomainVerifyPCIBridgesRecursive(ptr, 0, 1, nbuses); + if (rv == nbuses) { + ret = 0; + } else if (rv > 0) { + virReportError(VIR_ERR_XML_ERROR, _("not enough PCI bridges for %u" + " buses"), nbuses); + } +cleanup: + for (i = 0; i < nbuses; i++) + VIR_FREE(ptr[i]); + VIR_FREE(ptr); + return ret; +} + + int qemuDomainAssignPCIAddresses(virDomainDefPtr def, virQEMUCapsPtr qemuCaps, @@ -1321,11 +1482,20 @@ qemuDomainAssignPCIAddresses(virDomainDefPtr def, qemuDomainObjPrivatePtr priv = NULL; if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE)) { - if (!(addrs = qemuDomainPCIAddressSetCreate(def))) + if (!(addrs = qemuDomainPCIAddressSetCreate(def, 1))) goto cleanup; if (qemuAssignDevicePCISlots(def, qemuCaps, addrs) < 0) goto cleanup; + + if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_PCI_BRIDGE)) { + if (qemuDomainVerifyPCIBridges(def, addrs->maxbus + 1) < 0) + goto cleanup; + } else if (addrs->maxbus) { + virReportError(VIR_ERR_XML_ERROR, "%s", + _("Only PCI bus 0 is available")); + goto cleanup; + } } if (obj && obj->privateData) { @@ -1366,15 +1536,23 @@ int qemuDomainAssignAddresses(virDomainDefPtr def, return qemuDomainAssignPCIAddresses(def, qemuCaps, obj); } -qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def) +qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def, + unsigned int nbuses) { qemuDomainPCIAddressSetPtr addrs; + int i; if (VIR_ALLOC(addrs) < 0) goto no_memory; - if (VIR_ALLOC_N(addrs->used, 1) < 0) + if (VIR_ALLOC_N(addrs->used, nbuses) < 0) goto no_memory; + addrs->nbuses = nbuses; + + /* reserve slot 0 in every bus - it's used by the host bridge on bus 
0 + * and unusable on PCI bridges */ + for (i = 0; i < nbuses; i++) + addrs->used[i][0] = 0xFF; if (virDomainDeviceInfoIterate(def, qemuCollectPCIAddress, addrs) < 0) goto error; @@ -1409,6 +1587,9 @@ int qemuDomainPCIAddressReserveAddr(qemuDomainPCIAddressSetPtr addrs, { char *str; + if (qemuPCIAddressSetGrow(addrs, addr) < 0) + return -1; + if (qemuPCIAddressCheck(addrs, addr) < 0) return -1; @@ -1439,6 +1620,9 @@ int qemuDomainPCIAddressReserveSlot(qemuDomainPCIAddressSetPtr addrs, { char *str; + if (qemuPCIAddressSetGrow(addrs, addr) < 0) + return -1; + if (qemuPCIAddressCheck(addrs, addr) < 0) return -1; @@ -1524,7 +1708,9 @@ qemuDomainPCIAddressGetNextSlot(qemuDomainPCIAddressSetPtr addrs, tmp_addr.slot++; for (i = 0; i < QEMU_PCI_ADDRESS_LAST_SLOT; i++, tmp_addr.slot++) { if (QEMU_PCI_ADDRESS_LAST_SLOT <= tmp_addr.slot) { - tmp_addr.slot = 0; + /* slot 0 is unusable */ + tmp_addr.slot = 1; + i++; } if (!(addr = qemuPCIAddressAsString(&tmp_addr))) @@ -1620,11 +1806,6 @@ qemuAssignDevicePCISlots(virDomainDefPtr def, unsigned int *func = &tmp_addr.function; - /* Reserve slot 0 for the host bridge */ - memset(&tmp_addr, 0, sizeof(tmp_addr)); - if (qemuDomainPCIAddressReserveSlot(addrs, &tmp_addr) < 0) - goto error; - /* Verify that first IDE and USB controllers (if any) is on the PIIX3, fn 1 */ for (i = 0; i < def->ncontrollers ; i++) { /* First IDE controller lives on the PIIX3 at slot=1, function=1 */ diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h index 17687f4..56da69d 100644 --- a/src/qemu/qemu_command.h +++ b/src/qemu/qemu_command.h @@ -196,7 +196,8 @@ int qemuDomainAssignSpaprVIOAddresses(virDomainDefPtr def, int qemuDomainAssignPCIAddresses(virDomainDefPtr def, virQEMUCapsPtr qemuCaps, virDomainObjPtr obj); -qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def); +qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def, + unsigned int nbuses); int qemuDomainPCIAddressReserveSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr); int qemuDomainPCIAddressReserveAddr(qemuDomainPCIAddressSetPtr addrs, -- 1.8.1.5
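To illustrate the topology rule this patch enforces (a hypothetical config; the values are chosen only as an example): with explicit addresses, the bridge that provides bus N has to be the controller with index N. So a config like

  <controller type='pci-bridge' index='1'>
    <address type='pci' domain='0' bus='0' slot='4' function='0'/>
  </controller>
  <controller type='pci-bridge' index='2'>
    <address type='pci' domain='0' bus='1' slot='1' function='0'/>
  </controller>
  <interface type='network'>
    <source network='default'/>
    <address type='pci' domain='0' bus='2' slot='3' function='0'/>
  </interface>

should pass the check (index 1 hangs off bus 0 and provides bus 1, index 2 hangs off bus 1 and provides bus 2, and the interface then uses bus 2), while swapping the two bridge index values would trip the "PCI bridge index %d doesn't match expected index %d" error.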

Add a "dry run" address allocation to figure out how many bridges will be needed for all the devices without explicit addresses. Auto-add just enough bridges to put all the devices on, or up to the bridge with the largest specified index. --- src/qemu/qemu_command.c | 138 ++++++++++++++++++++++++++++++++++++++---------- src/qemu/qemu_command.h | 3 +- 2 files changed, 111 insertions(+), 30 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 7073844..19fdf39 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -1197,6 +1197,8 @@ struct _qemuDomainPCIAddressSet { _qemuDomainPCIAddressBus *used; size_t nbuses; /* allocation of used */ unsigned int maxbus; /* maximum used bus number */ + bool dryRun; /* on a dry run, new buses are auto-added + and addresses aren't saved in device infos */ virDevicePCIAddress lastaddr; }; @@ -1308,7 +1310,7 @@ static int qemuCollectPCIAddress(virDomainDefPtr def ATTRIBUTE_UNUSED, return 0; } - if (qemuPCIAddressSetGrow(addrs, addr) < 0) + if (addrs->dryRun && qemuPCIAddressSetGrow(addrs, addr) < 0) return -1; if (qemuPCIAddressCheck(addrs, addr) < 0) @@ -1472,6 +1474,57 @@ cleanup: } +/* + * Add bridges from 1 to addrs->nbuses + * or the highest bridge index in def + */ +static int +qemuDomainMaybeAddPCIBridges(virDomainDefPtr def, + qemuDomainPCIAddressSetPtr addrs) +{ + virDomainControllerDefPtr cont = NULL; + int i; + int max = addrs->nbuses - 1; + + for (i = 0; i < def->ncontrollers; i++) { + if (def->controllers[i]->type == VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE) + if (def->controllers[i]->idx > max) + max = def->controllers[i]->idx; + } + + for (max++, i = 1; i < max; i++) { + if (virDomainControllerFind(def, + VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE, + i) >= 0) + continue; + + if (VIR_ALLOC(cont) < 0) + goto no_memory; + + cont->type = VIR_DOMAIN_CONTROLLER_TYPE_PCI_BRIDGE; + cont->idx = i; + cont->model = -1; + + if (virDomainControllerInsert(def, cont) < 0) + goto no_memory; + + /* This might change addrs->nbuses */ + if (qemuDomainPCIAddressSetNextAddr(addrs, &cont->info) < 0) + goto cleanup; + + if (addrs->nbuses > max) + max = addrs->nbuses; + } + return max; + +no_memory: + virReportOOMError(); +cleanup: + VIR_FREE(cont); + return -1; +} + + int qemuDomainAssignPCIAddresses(virDomainDefPtr def, virQEMUCapsPtr qemuCaps, @@ -1480,21 +1533,33 @@ qemuDomainAssignPCIAddresses(virDomainDefPtr def, int ret = -1; qemuDomainPCIAddressSetPtr addrs = NULL; qemuDomainObjPrivatePtr priv = NULL; + int nbuses = 1; if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE)) { - if (!(addrs = qemuDomainPCIAddressSetCreate(def, 1))) + if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_PCI_BRIDGE)) { + virDomainDeviceInfo info; + /* 1st pass to figure out how many PCI bridges we need */ + if (!(addrs = qemuDomainPCIAddressSetCreate(def, 1, true))) + goto cleanup; + /* Reserve 1 extra slot for a bridge */ + if (qemuDomainPCIAddressSetNextAddr(addrs, &info) < 0) + goto cleanup; + + nbuses = qemuDomainMaybeAddPCIBridges(def, addrs); + if (nbuses < 0) + goto cleanup; + qemuDomainPCIAddressSetFree(addrs); + addrs = NULL; + } + if (!(addrs = qemuDomainPCIAddressSetCreate(def, nbuses, false))) goto cleanup; if (qemuAssignDevicePCISlots(def, qemuCaps, addrs) < 0) goto cleanup; if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_DEVICE_PCI_BRIDGE)) { - if (qemuDomainVerifyPCIBridges(def, addrs->maxbus + 1) < 0) + if (qemuDomainVerifyPCIBridges(def, addrs->nbuses) < 0) goto cleanup; - } else if (addrs->maxbus) { - virReportError(VIR_ERR_XML_ERROR, "%s", - _("Only 
PCI bus 0 is available")); - goto cleanup; } } @@ -1537,7 +1602,8 @@ int qemuDomainAssignAddresses(virDomainDefPtr def, } qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def, - unsigned int nbuses) + unsigned int nbuses, + bool dryRun) { qemuDomainPCIAddressSetPtr addrs; int i; @@ -1548,6 +1614,7 @@ qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def, if (VIR_ALLOC_N(addrs->used, nbuses) < 0) goto no_memory; addrs->nbuses = nbuses; + addrs->dryRun = dryRun; /* reserve slot 0 in every bus - it's used by the host bridge on bus 0 * and unusable on PCI bridges */ @@ -1587,7 +1654,7 @@ int qemuDomainPCIAddressReserveAddr(qemuDomainPCIAddressSetPtr addrs, { char *str; - if (qemuPCIAddressSetGrow(addrs, addr) < 0) + if (addrs->dryRun && qemuPCIAddressSetGrow(addrs, addr) < 0) return -1; if (qemuPCIAddressCheck(addrs, addr) < 0) @@ -1620,7 +1687,7 @@ int qemuDomainPCIAddressReserveSlot(qemuDomainPCIAddressSetPtr addrs, { char *str; - if (qemuPCIAddressSetGrow(addrs, addr) < 0) + if (addrs->dryRun && qemuPCIAddressSetGrow(addrs, addr) < 0) return -1; if (qemuPCIAddressCheck(addrs, addr) < 0) @@ -1702,32 +1769,43 @@ qemuDomainPCIAddressGetNextSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr next_addr) { virDevicePCIAddress tmp_addr = addrs->lastaddr; - int i; + int i,j; char *addr; tmp_addr.slot++; - for (i = 0; i < QEMU_PCI_ADDRESS_LAST_SLOT; i++, tmp_addr.slot++) { - if (QEMU_PCI_ADDRESS_LAST_SLOT <= tmp_addr.slot) { - /* slot 0 is unusable */ + for (j = 0; j < addrs->nbuses; j++, tmp_addr.bus++) { + if (addrs->nbuses <= tmp_addr.bus) { + if (addrs->dryRun) { + if (qemuPCIAddressSetGrow(addrs, &tmp_addr) < 0) + return -1; + } else { + tmp_addr.bus = 0; + } tmp_addr.slot = 1; - i++; } + for (i = 0; i < QEMU_PCI_ADDRESS_LAST_SLOT; i++, tmp_addr.slot++) { + if (QEMU_PCI_ADDRESS_LAST_SLOT <= tmp_addr.slot) { + /* slot 0 is unusable */ + tmp_addr.slot = 1; + i++; + } - if (!(addr = qemuPCIAddressAsString(&tmp_addr))) - return -1; + if (!(addr = qemuPCIAddressAsString(&tmp_addr))) + return -1; - if (qemuDomainPCIAddressCheckSlot(addrs, &tmp_addr) < 0) { - VIR_DEBUG("PCI addr %s already in use", addr); - VIR_FREE(addr); - continue; - } + if (qemuDomainPCIAddressCheckSlot(addrs, &tmp_addr) < 0) { + VIR_DEBUG("PCI addr %s already in use", addr); + VIR_FREE(addr); + continue; + } - VIR_DEBUG("Found free PCI addr %s", addr); - VIR_FREE(addr); + VIR_DEBUG("Found free PCI addr %s", addr); + VIR_FREE(addr); - addrs->lastaddr = tmp_addr; - *next_addr = tmp_addr; - return 0; + addrs->lastaddr = tmp_addr; + *next_addr = tmp_addr; + return 0; + } } virReportError(VIR_ERR_INTERNAL_ERROR, @@ -1745,8 +1823,10 @@ int qemuDomainPCIAddressSetNextAddr(qemuDomainPCIAddressSetPtr addrs, if (qemuDomainPCIAddressReserveSlot(addrs, &addr) < 0) return -1; - dev->type = VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI; - dev->addr.pci = addr; + if (!addrs->dryRun) { + dev->type = VIR_DOMAIN_DEVICE_ADDRESS_TYPE_PCI; + dev->addr.pci = addr; + } addrs->lastaddr = addr; return 0; diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h index 56da69d..0f8a248 100644 --- a/src/qemu/qemu_command.h +++ b/src/qemu/qemu_command.h @@ -197,7 +197,8 @@ int qemuDomainAssignPCIAddresses(virDomainDefPtr def, virQEMUCapsPtr qemuCaps, virDomainObjPtr obj); qemuDomainPCIAddressSetPtr qemuDomainPCIAddressSetCreate(virDomainDefPtr def, - unsigned int nbuses); + unsigned int nbuses, + bool dryRun); int qemuDomainPCIAddressReserveSlot(qemuDomainPCIAddressSetPtr addrs, virDevicePCIAddressPtr addr); 
int qemuDomainPCIAddressReserveAddr(qemuDomainPCIAddressSetPtr addrs, -- 1.8.1.5
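As I read it, the practical effect (just a sketch of the intended behavior, not something I've tested): the bridges no longer have to be spelled out. A config containing only, say,

  <interface type='network'>
    <source network='default'/>
    <address type='pci' domain='0' bus='2' slot='3' function='0'/>
  </interface>

and no <controller type='pci-bridge'> elements would have the dry-run pass notice that bus 2 is referenced, auto-add pci-bridge controllers with index 1 and 2 (addressed onto the lower buses), and only then do the real slot assignment; similarly, if more devices are defined than fit on bus 0, just enough bridges are auto-added to hold the overflow.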