diff options
author | Tomáš Mózes <hydrapolic@gmail.com> | 2024-02-03 19:12:02 +0100 |
---|---|---|
committer | Tomáš Mózes <hydrapolic@gmail.com> | 2024-02-03 19:12:02 +0100 |
commit | 0fbc09bbe820146fd857c79bb150028703342c87 (patch) | |
tree | 94332b5d49d2af5bf62da5afd18776a9b8b87450 | |
parent | Xen 4.17.3-pre-patchset-0 (diff) | |
download | xen-upstream-patches-0fbc09bbe820146fd857c79bb150028703342c87.tar.gz xen-upstream-patches-0fbc09bbe820146fd857c79bb150028703342c87.tar.bz2 xen-upstream-patches-0fbc09bbe820146fd857c79bb150028703342c87.zip |
Xen 4.17.4-pre-patchset-0 (tag: 4.17.4-pre-patchset-0)
Signed-off-by: Tomáš Mózes <hydrapolic@gmail.com>
65 files changed, 704 insertions, 6120 deletions
diff --git a/0001-update-Xen-version-to-4.17.3-pre.patch b/0001-update-Xen-version-to-4.17.4-pre.patch index 1be1cd1..b532743 100644 --- a/0001-update-Xen-version-to-4.17.3-pre.patch +++ b/0001-update-Xen-version-to-4.17.4-pre.patch @@ -1,25 +1,25 @@ -From 2f337a04bfc2dda794ae0fc108577ec72932f83b Mon Sep 17 00:00:00 2001 +From 4f6e9d4327eb5252f1e8cac97a095d8b8485dadb Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 21 Aug 2023 15:52:13 +0200 -Subject: [PATCH 01/55] update Xen version to 4.17.3-pre +Date: Tue, 30 Jan 2024 14:36:44 +0100 +Subject: [PATCH 01/10] update Xen version to 4.17.4-pre --- xen/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xen/Makefile b/xen/Makefile -index fbada570b8..f6005bd536 100644 +index a46e6330db..dd0b004e1c 100644 --- a/xen/Makefile +++ b/xen/Makefile @@ -6,7 +6,7 @@ this-makefile := $(call lastword,$(MAKEFILE_LIST)) # All other places this is stored (eg. compile.h) should be autogenerated. export XEN_VERSION = 4 export XEN_SUBVERSION = 17 --export XEN_EXTRAVERSION ?= .2$(XEN_VENDORVERSION) -+export XEN_EXTRAVERSION ?= .3-pre$(XEN_VENDORVERSION) +-export XEN_EXTRAVERSION ?= .3$(XEN_VENDORVERSION) ++export XEN_EXTRAVERSION ?= .4-pre$(XEN_VENDORVERSION) export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) -include xen-version -- -2.42.0 +2.43.0 diff --git a/0002-pci-fail-device-assignment-if-phantom-functions-cann.patch b/0002-pci-fail-device-assignment-if-phantom-functions-cann.patch new file mode 100644 index 0000000..d91802f --- /dev/null +++ b/0002-pci-fail-device-assignment-if-phantom-functions-cann.patch @@ -0,0 +1,91 @@ +From f9e1ed51bdba31017ea17e1819eb2ade6b5c8615 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 30 Jan 2024 14:37:39 +0100 +Subject: [PATCH 02/10] pci: fail device assignment if phantom functions cannot + be assigned +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 
+Content-Transfer-Encoding: 8bit + +The current behavior is that no error is reported if (some) phantom functions +fail to be assigned during device add or assignment, so the operation succeeds +even if some phantom functions are not correctly setup. + +This can lead to devices possibly being successfully assigned to a domU while +some of the device phantom functions are still assigned to dom0. Even when the +device is assigned domIO before being assigned to a domU phantom functions +might fail to be assigned to domIO, and also fail to be assigned to the domU, +leaving them assigned to dom0. + +Since the device can generate requests using the IDs of those phantom +functions, given the scenario above a device in such state would be in control +of a domU, but still capable of generating transactions that use a context ID +targeting dom0 owned memory. + +Modify device assign in order to attempt to deassign the device if phantom +functions failed to be assigned. + +Note that device addition is not modified in the same way, as in that case the +device is assigned to a trusted domain, and hence partial assign can lead to +device malfunction but not a security issue. 
+ +This is XSA-449 / CVE-2023-46839 + +Fixes: 4e9950dc1bd2 ('IOMMU: add phantom function support') +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: cb4ecb3cc17b02c2814bc817efd05f3f3ba33d1e +master date: 2024-01-30 14:28:01 +0100 +--- + xen/drivers/passthrough/pci.c | 27 +++++++++++++++++++++------ + 1 file changed, 21 insertions(+), 6 deletions(-) + +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 07d1986d33..8c62b14d19 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -1444,11 +1444,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + + pdev->fault.count = 0; + +- if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn, +- pci_to_dev(pdev), flag)) ) +- goto done; ++ rc = iommu_call(hd->platform_ops, assign_device, d, devfn, pci_to_dev(pdev), ++ flag); + +- for ( ; pdev->phantom_stride; rc = 0 ) ++ while ( pdev->phantom_stride && !rc ) + { + devfn += pdev->phantom_stride; + if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) +@@ -1459,8 +1458,24 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + + done: + if ( rc ) +- printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n", +- d, &PCI_SBDF(seg, bus, devfn), rc); ++ { ++ printk(XENLOG_G_WARNING "%pd: assign %s(%pp) failed (%d)\n", ++ d, devfn != pdev->devfn ? "phantom function " : "", ++ &PCI_SBDF(seg, bus, devfn), rc); ++ ++ if ( devfn != pdev->devfn && deassign_device(d, seg, bus, pdev->devfn) ) ++ { ++ /* ++ * Device with phantom functions that failed to both assign and ++ * rollback. Mark the device as broken and crash the target domain, ++ * as the state of the functions at this point is unknown and Xen ++ * has no way to assert consistent context assignment among them. 
++ */ ++ pdev->broken = true; ++ if ( !is_hardware_domain(d) && d != dom_io ) ++ domain_crash(d); ++ } ++ } + /* The device is assigned to dom_io so mark it as quarantined */ + else if ( d == dom_io ) + pdev->quarantine = true; +-- +2.43.0 + diff --git a/0002-x86-fix-build-with-old-gcc-after-CPU-policy-changes.patch b/0002-x86-fix-build-with-old-gcc-after-CPU-policy-changes.patch deleted file mode 100644 index 1b62572..0000000 --- a/0002-x86-fix-build-with-old-gcc-after-CPU-policy-changes.patch +++ /dev/null @@ -1,84 +0,0 @@ -From 7d8897984927a51495e9a1b827aa4bce1d779b87 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 21 Aug 2023 15:53:17 +0200 -Subject: [PATCH 02/55] x86: fix build with old gcc after CPU policy changes - -Old gcc won't cope with initializers involving unnamed struct/union -fields. - -Fixes: 441b1b2a50ea ("x86/emul: Switch x86_emulate_ctxt to cpu_policy") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 768846690d64bc730c1a1123e8de3af731bb2eb3 -master date: 2023-04-19 11:02:47 +0200 ---- - tools/fuzz/x86_instruction_emulator/fuzz-emul.c | 4 +++- - xen/arch/x86/pv/emul-priv-op.c | 4 +++- - xen/arch/x86/pv/ro-page-fault.c | 4 +++- - 3 files changed, 9 insertions(+), 3 deletions(-) - -diff --git a/tools/fuzz/x86_instruction_emulator/fuzz-emul.c b/tools/fuzz/x86_instruction_emulator/fuzz-emul.c -index 4885a68210..eeeb6931f4 100644 ---- a/tools/fuzz/x86_instruction_emulator/fuzz-emul.c -+++ b/tools/fuzz/x86_instruction_emulator/fuzz-emul.c -@@ -893,12 +893,14 @@ int LLVMFuzzerTestOneInput(const uint8_t *data_p, size_t size) - struct x86_emulate_ctxt ctxt = { - .data = &state, - .regs = &input.regs, -- .cpu_policy = &cp, - .addr_size = 8 * sizeof(void *), - .sp_size = 8 * sizeof(void *), - }; - int rc; - -+ /* Not part of the initializer, for old gcc to cope. 
*/ -+ ctxt.cpu_policy = &cp; -+ - /* Reset all global state variables */ - memset(&input, 0, sizeof(input)); - -diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c -index 04416f1979..2c94beb10e 100644 ---- a/xen/arch/x86/pv/emul-priv-op.c -+++ b/xen/arch/x86/pv/emul-priv-op.c -@@ -1327,12 +1327,14 @@ int pv_emulate_privileged_op(struct cpu_user_regs *regs) - struct domain *currd = curr->domain; - struct priv_op_ctxt ctxt = { - .ctxt.regs = regs, -- .ctxt.cpu_policy = currd->arch.cpu_policy, - .ctxt.lma = !is_pv_32bit_domain(currd), - }; - int rc; - unsigned int eflags, ar; - -+ /* Not part of the initializer, for old gcc to cope. */ -+ ctxt.ctxt.cpu_policy = currd->arch.cpu_policy; -+ - if ( !pv_emul_read_descriptor(regs->cs, curr, &ctxt.cs.base, - &ctxt.cs.limit, &ar, 1) || - !(ar & _SEGMENT_S) || -diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c -index 0d02c7d2ab..f23ad5d184 100644 ---- a/xen/arch/x86/pv/ro-page-fault.c -+++ b/xen/arch/x86/pv/ro-page-fault.c -@@ -356,7 +356,6 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs) - unsigned int addr_size = is_pv_32bit_domain(currd) ? 32 : BITS_PER_LONG; - struct x86_emulate_ctxt ctxt = { - .regs = regs, -- .cpu_policy = currd->arch.cpu_policy, - .addr_size = addr_size, - .sp_size = addr_size, - .lma = addr_size > 32, -@@ -364,6 +363,9 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs) - int rc; - bool mmio_ro; - -+ /* Not part of the initializer, for old gcc to cope. */ -+ ctxt.cpu_policy = currd->arch.cpu_policy; -+ - /* Attempt to read the PTE that maps the VA being accessed. 
*/ - pte = guest_get_eff_kern_l1e(addr); - --- -2.42.0 - diff --git a/0003-VT-d-Fix-else-vs-endif-misplacement.patch b/0003-VT-d-Fix-else-vs-endif-misplacement.patch new file mode 100644 index 0000000..2e7f78d --- /dev/null +++ b/0003-VT-d-Fix-else-vs-endif-misplacement.patch @@ -0,0 +1,70 @@ +From 6b1864afc14d484cdbc9754ce3172ac3dc189846 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 30 Jan 2024 14:38:38 +0100 +Subject: [PATCH 03/10] VT-d: Fix "else" vs "#endif" misplacement + +In domain_pgd_maddr() the "#endif" is misplaced with respect to "else". This +generates incorrect logic when CONFIG_HVM is compiled out, as the "else" body +is executed unconditionally. + +Rework the logic to use IS_ENABLED() instead of explicit #ifdef-ary, as it's +clearer to follow. This in turn involves adjusting p2m_get_pagetable() to +compile when CONFIG_HVM is disabled. + +This is XSA-450 / CVE-2023-46840. + +Fixes: 033ff90aa9c1 ("x86/P2M: p2m_{alloc,free}_ptp() and p2m_alloc_table() are HVM-only") +Reported-by: Teddy Astie <teddy.astie@vates.tech> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: cc6ba68edf6dcd18c3865e7d7c0f1ed822796426 +master date: 2024-01-30 14:29:15 +0100 +--- + xen/arch/x86/include/asm/p2m.h | 9 ++++++++- + xen/drivers/passthrough/vtd/iommu.c | 4 +--- + 2 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/include/asm/p2m.h b/xen/arch/x86/include/asm/p2m.h +index cd43d8621a..4f691533d5 100644 +--- a/xen/arch/x86/include/asm/p2m.h ++++ b/xen/arch/x86/include/asm/p2m.h +@@ -447,7 +447,14 @@ static inline bool_t p2m_is_altp2m(const struct p2m_domain *p2m) + return p2m->p2m_class == p2m_alternate; + } + +-#define p2m_get_pagetable(p2m) ((p2m)->phys_table) ++#ifdef CONFIG_HVM ++static inline pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m) ++{ ++ return p2m->phys_table; ++} ++#else ++pagetable_t p2m_get_pagetable(const struct 
p2m_domain *p2m); ++#endif + + /* + * Ensure any deferred p2m TLB flush has been completed on all VCPUs. +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index b4c11a6b48..908b3ba6ee 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -441,15 +441,13 @@ static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr, + + if ( pgd_maddr ) + /* nothing */; +-#ifdef CONFIG_HVM +- else if ( iommu_use_hap_pt(d) ) ++ else if ( IS_ENABLED(CONFIG_HVM) && iommu_use_hap_pt(d) ) + { + pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d)); + + pgd_maddr = pagetable_get_paddr(pgt); + } + else +-#endif + { + if ( !hd->arch.vtd.pgd_maddr ) + { +-- +2.43.0 + diff --git a/0003-libxl-Use-XEN_LIB_DIR-to-store-bootloader-from-pygru.patch b/0003-libxl-Use-XEN_LIB_DIR-to-store-bootloader-from-pygru.patch deleted file mode 100644 index a395d7a..0000000 --- a/0003-libxl-Use-XEN_LIB_DIR-to-store-bootloader-from-pygru.patch +++ /dev/null @@ -1,45 +0,0 @@ -From 8d84be5b557b27e9cc53e48285aebad28a48468c Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Mon, 21 Aug 2023 15:53:47 +0200 -Subject: [PATCH 03/55] libxl: Use XEN_LIB_DIR to store bootloader from pygrub - -In osstest, the jobs using pygrub on arm64 on the branch linux-linus -started to fails with: - [Errno 28] No space left on device - Error writing temporary copy of ramdisk - -This is because /var/run is small when dom0 has only 512MB to work -with, /var/run is only 40MB. The size of both kernel and ramdisk on -this jobs is now about 42MB, so not enough space in /var/run. - -So, to avoid writing a big binary in ramfs, we will use /var/lib -instead, like we already do when saving the device model state on -migration. 
- -Reported-by: Jan Beulich <jbeulich@suse.com> -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Jason Andryuk <jandryuk@gmail.com> -master commit: ad89640ad766d3cb6c92fc8b6406ca6bbab44136 -master date: 2023-08-08 09:45:20 +0200 ---- - tools/libs/light/libxl_bootloader.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c -index 1bc6e51827..108329b4a5 100644 ---- a/tools/libs/light/libxl_bootloader.c -+++ b/tools/libs/light/libxl_bootloader.c -@@ -245,8 +245,8 @@ static void bootloader_cleanup(libxl__egc *egc, libxl__bootloader_state *bl) - static void bootloader_setpaths(libxl__gc *gc, libxl__bootloader_state *bl) - { - uint32_t domid = bl->domid; -- bl->outputdir = GCSPRINTF(XEN_RUN_DIR "/bootloader.%"PRIu32".d", domid); -- bl->outputpath = GCSPRINTF(XEN_RUN_DIR "/bootloader.%"PRIu32".out", domid); -+ bl->outputdir = GCSPRINTF(XEN_LIB_DIR "/bootloader.%"PRIu32".d", domid); -+ bl->outputpath = GCSPRINTF(XEN_LIB_DIR "/bootloader.%"PRIu32".out", domid); - } - - /* Callbacks */ --- -2.42.0 - diff --git a/0004-build-define-ARCH-and-SRCARCH-later.patch b/0004-build-define-ARCH-and-SRCARCH-later.patch deleted file mode 100644 index aebcbb7..0000000 --- a/0004-build-define-ARCH-and-SRCARCH-later.patch +++ /dev/null @@ -1,67 +0,0 @@ -From 1c3927f8f6743538a35aa45a91a2d4adbde9f277 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Wed, 5 Jul 2023 08:25:03 +0200 -Subject: [PATCH 04/55] build: define ARCH and SRCARCH later - -Defining ARCH and SRCARCH later in xen/Makefile allows to switch to -immediate evaluation variable type. - -ARCH and SRCARCH depend on value defined in Config.mk and aren't used -for e.g. TARGET_SUBARCH or TARGET_ARCH, and not before they're needed in -a sub-make or a rule. - -This will help reduce the number of times the shell rune is been -run. 
- -With GNU make 4.4, the number of execution of the command present in -these $(shell ) increased greatly. This is probably because as of make -4.4, exported variable are also added to the environment of $(shell ) -construct. - -Also, `make -d` shows a lot of these: - Makefile:39: not recursively expanding SRCARCH to export to shell function - Makefile:38: not recursively expanding ARCH to export to shell function - -Reported-by: Jason Andryuk <jandryuk@gmail.com> -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Tested-by: Jason Andryuk <jandryuk@gmail.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 58e0a3f3b2c430f8640ef9df67ac857b0008ebc8) ---- - xen/Makefile | 13 +++++++------ - 1 file changed, 7 insertions(+), 6 deletions(-) - -diff --git a/xen/Makefile b/xen/Makefile -index f6005bd536..7ecfa6e8e9 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -35,12 +35,6 @@ MAKEFLAGS += -rR - - EFI_MOUNTPOINT ?= $(BOOT_DIR)/efi - --ARCH=$(XEN_TARGET_ARCH) --SRCARCH=$(shell echo $(ARCH) | \ -- sed -e 's/x86.*/x86/' -e s'/arm\(32\|64\)/arm/g' \ -- -e s'/riscv.*/riscv/g') --export ARCH SRCARCH -- - # Allow someone to change their config file - export KCONFIG_CONFIG ?= .config - -@@ -241,6 +235,13 @@ include scripts/Kbuild.include - include $(XEN_ROOT)/Config.mk - - # Set ARCH/SUBARCH appropriately. 
-+ -+ARCH := $(XEN_TARGET_ARCH) -+SRCARCH := $(shell echo $(ARCH) | \ -+ sed -e 's/x86.*/x86/' -e 's/arm\(32\|64\)/arm/g' \ -+ -e 's/riscv.*/riscv/g') -+export ARCH SRCARCH -+ - export TARGET_SUBARCH := $(XEN_TARGET_ARCH) - export TARGET_ARCH := $(shell echo $(XEN_TARGET_ARCH) | \ - sed -e 's/x86.*/x86/' -e s'/arm\(32\|64\)/arm/g' \ --- -2.42.0 - diff --git a/0004-x86-amd-Extend-CPU-erratum-1474-fix-to-more-affected.patch b/0004-x86-amd-Extend-CPU-erratum-1474-fix-to-more-affected.patch new file mode 100644 index 0000000..f1289aa --- /dev/null +++ b/0004-x86-amd-Extend-CPU-erratum-1474-fix-to-more-affected.patch @@ -0,0 +1,123 @@ +From abcc32f0634627fe21117a48bd10e792bfbdd6dc Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Fri, 2 Feb 2024 08:01:09 +0100 +Subject: [PATCH 04/10] x86/amd: Extend CPU erratum #1474 fix to more affected + models +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Erratum #1474 has now been extended to cover models from family 17h ranges +00-2Fh, so the errata now covers all the models released under Family +17h (Zen, Zen+ and Zen2). + +Additionally extend the workaround to Family 18h (Hygon), since it's based on +the Zen architecture and very likely affected. + +Rename all the zen2 related symbols to fam17, since the errata doesn't +exclusively affect Zen2 anymore. 
+ +Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 23db507a01a4ec5259ec0ab43d296a41b1c326ba +master date: 2023-12-21 12:19:40 +0000 +--- + xen/arch/x86/cpu/amd.c | 27 ++++++++++++++------------- + 1 file changed, 14 insertions(+), 13 deletions(-) + +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index 29ae97e7c0..3d85e9797d 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -54,7 +54,7 @@ bool __read_mostly amd_acpi_c1e_quirk; + bool __ro_after_init amd_legacy_ssbd; + bool __initdata amd_virt_spec_ctrl; + +-static bool __read_mostly zen2_c6_disabled; ++static bool __read_mostly fam17_c6_disabled; + + static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo, + unsigned int *hi) +@@ -951,24 +951,24 @@ void amd_check_zenbleed(void) + val & chickenbit ? "chickenbit" : "microcode"); + } + +-static void cf_check zen2_disable_c6(void *arg) ++static void cf_check fam17_disable_c6(void *arg) + { + /* Disable C6 by clearing the CCR{0,1,2}_CC6EN bits. */ + const uint64_t mask = ~((1ul << 6) | (1ul << 14) | (1ul << 22)); + uint64_t val; + +- if (!zen2_c6_disabled) { ++ if (!fam17_c6_disabled) { + printk(XENLOG_WARNING + "Disabling C6 after 1000 days apparent uptime due to AMD errata 1474\n"); +- zen2_c6_disabled = true; ++ fam17_c6_disabled = true; + /* + * Prevent CPU hotplug so that started CPUs will either see +- * zen2_c6_disabled set, or will be handled by ++ * zen_c6_disabled set, or will be handled by + * smp_call_function(). 
+ */ + while (!get_cpu_maps()) + process_pending_softirqs(); +- smp_call_function(zen2_disable_c6, NULL, 0); ++ smp_call_function(fam17_disable_c6, NULL, 0); + put_cpu_maps(); + } + +@@ -1273,8 +1273,8 @@ static void cf_check init_amd(struct cpuinfo_x86 *c) + amd_check_zenbleed(); + amd_check_erratum_1485(); + +- if (zen2_c6_disabled) +- zen2_disable_c6(NULL); ++ if (fam17_c6_disabled) ++ fam17_disable_c6(NULL); + + check_syscfg_dram_mod_en(); + +@@ -1286,7 +1286,7 @@ const struct cpu_dev amd_cpu_dev = { + .c_init = init_amd, + }; + +-static int __init cf_check zen2_c6_errata_check(void) ++static int __init cf_check amd_check_erratum_1474(void) + { + /* + * Errata #1474: A Core May Hang After About 1044 Days +@@ -1294,7 +1294,8 @@ static int __init cf_check zen2_c6_errata_check(void) + */ + s_time_t delta; + +- if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || !is_zen2_uarch()) ++ if (cpu_has_hypervisor || ++ (boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18)) + return 0; + + /* +@@ -1309,10 +1310,10 @@ static int __init cf_check zen2_c6_errata_check(void) + if (delta > 0) { + static struct timer errata_c6; + +- init_timer(&errata_c6, zen2_disable_c6, NULL, 0); ++ init_timer(&errata_c6, fam17_disable_c6, NULL, 0); + set_timer(&errata_c6, NOW() + delta); + } else +- zen2_disable_c6(NULL); ++ fam17_disable_c6(NULL); + + return 0; + } +@@ -1320,4 +1321,4 @@ static int __init cf_check zen2_c6_errata_check(void) + * Must be executed after early_time_init() for tsc_ticks2ns() to have been + * calibrated. That prevents us doing the check in init_amd(). 
+ */ +-presmp_initcall(zen2_c6_errata_check); ++presmp_initcall(amd_check_erratum_1474); +-- +2.43.0 + diff --git a/0005-CirrusCI-drop-FreeBSD-12.patch b/0005-CirrusCI-drop-FreeBSD-12.patch new file mode 100644 index 0000000..cca7bb0 --- /dev/null +++ b/0005-CirrusCI-drop-FreeBSD-12.patch @@ -0,0 +1,39 @@ +From 0ef1fb43ddd61b3c4c953e833e012ac21ad5ca0f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Fri, 2 Feb 2024 08:01:50 +0100 +Subject: [PATCH 05/10] CirrusCI: drop FreeBSD 12 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Went EOL by the end of December 2023, and the pkg repos have been shut down. + +Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: c2ce3466472e9c9eda79f5dc98eb701bc6fdba20 +master date: 2024-01-15 12:20:11 +0100 +--- + .cirrus.yml | 6 ------ + 1 file changed, 6 deletions(-) + +diff --git a/.cirrus.yml b/.cirrus.yml +index 7e0beb200d..63f3afb104 100644 +--- a/.cirrus.yml ++++ b/.cirrus.yml +@@ -14,12 +14,6 @@ freebsd_template: &FREEBSD_TEMPLATE + - ./configure --with-system-seabios=/usr/local/share/seabios/bios.bin + - gmake -j`sysctl -n hw.ncpu` clang=y + +-task: +- name: 'FreeBSD 12' +- freebsd_instance: +- image_family: freebsd-12-4 +- << : *FREEBSD_TEMPLATE +- + task: + name: 'FreeBSD 13' + freebsd_instance: +-- +2.43.0 + diff --git a/0005-build-remove-TARGET_SUBARCH-a-duplicate-of-ARCH.patch b/0005-build-remove-TARGET_SUBARCH-a-duplicate-of-ARCH.patch deleted file mode 100644 index 4f31614..0000000 --- a/0005-build-remove-TARGET_SUBARCH-a-duplicate-of-ARCH.patch +++ /dev/null @@ -1,50 +0,0 @@ -From 56076ef445073458c39c481f9b70c3b4ff848839 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Wed, 5 Jul 2023 08:27:51 +0200 -Subject: [PATCH 05/55] build: remove TARGET_SUBARCH, a duplicate of 
ARCH - -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit a6ab7dd061338c33faef629cbe52ed1608571d84) ---- - xen/Makefile | 3 +-- - xen/build.mk | 2 +- - 2 files changed, 2 insertions(+), 3 deletions(-) - -diff --git a/xen/Makefile b/xen/Makefile -index 7ecfa6e8e9..6e89bcf348 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -234,7 +234,7 @@ include scripts/Kbuild.include - # we need XEN_TARGET_ARCH to generate the proper config - include $(XEN_ROOT)/Config.mk - --# Set ARCH/SUBARCH appropriately. -+# Set ARCH/SRCARCH appropriately. - - ARCH := $(XEN_TARGET_ARCH) - SRCARCH := $(shell echo $(ARCH) | \ -@@ -242,7 +242,6 @@ SRCARCH := $(shell echo $(ARCH) | \ - -e 's/riscv.*/riscv/g') - export ARCH SRCARCH - --export TARGET_SUBARCH := $(XEN_TARGET_ARCH) - export TARGET_ARCH := $(shell echo $(XEN_TARGET_ARCH) | \ - sed -e 's/x86.*/x86/' -e s'/arm\(32\|64\)/arm/g' \ - -e s'/riscv.*/riscv/g') -diff --git a/xen/build.mk b/xen/build.mk -index 758590c68e..d049d3a53a 100644 ---- a/xen/build.mk -+++ b/xen/build.mk -@@ -41,7 +41,7 @@ include/xen/compile.h: include/xen/compile.h.in .banner FORCE - targets += include/xen/compile.h - - -include $(wildcard .asm-offsets.s.d) --asm-offsets.s: arch/$(TARGET_ARCH)/$(TARGET_SUBARCH)/asm-offsets.c -+asm-offsets.s: arch/$(TARGET_ARCH)/$(ARCH)/asm-offsets.c - $(CC) $(call cpp_flags,$(c_flags)) -S -g0 -o $@.new -MQ $@ $< - $(call move-if-changed,$@.new,$@) - --- -2.42.0 - diff --git a/0006-build-remove-TARGET_ARCH-a-duplicate-of-SRCARCH.patch b/0006-build-remove-TARGET_ARCH-a-duplicate-of-SRCARCH.patch deleted file mode 100644 index 9eef37a..0000000 --- a/0006-build-remove-TARGET_ARCH-a-duplicate-of-SRCARCH.patch +++ /dev/null @@ -1,123 +0,0 @@ -From 36e84ea02e1e8dce8f3a4e9351ab1c72dec3c11e Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Wed, 5 Jul 2023 08:29:49 +0200 -Subject: [PATCH 06/55] build: remove TARGET_ARCH, a 
duplicate of SRCARCH - -The same command is used to generate the value of both $(TARGET_ARCH) -and $(SRCARCH), as $(ARCH) is an alias for $(XEN_TARGET_ARCH). - -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit ac27b3beb9b7b423d5563768de890c7594c21b4e) ---- - xen/Makefile | 20 ++++++++------------ - xen/Rules.mk | 2 +- - xen/build.mk | 6 +++--- - 3 files changed, 12 insertions(+), 16 deletions(-) - -diff --git a/xen/Makefile b/xen/Makefile -index 6e89bcf348..1a3b9a081f 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -242,10 +242,6 @@ SRCARCH := $(shell echo $(ARCH) | \ - -e 's/riscv.*/riscv/g') - export ARCH SRCARCH - --export TARGET_ARCH := $(shell echo $(XEN_TARGET_ARCH) | \ -- sed -e 's/x86.*/x86/' -e s'/arm\(32\|64\)/arm/g' \ -- -e s'/riscv.*/riscv/g') -- - export CONFIG_SHELL := $(SHELL) - export CC CXX LD NM OBJCOPY OBJDUMP ADDR2LINE - export YACC = $(if $(BISON),$(BISON),bison) -@@ -262,7 +258,7 @@ export XEN_TREEWIDE_CFLAGS := $(CFLAGS) - ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),) - CLANG_FLAGS := - --ifeq ($(TARGET_ARCH),x86) -+ifeq ($(SRCARCH),x86) - # The tests to select whether the integrated assembler is usable need to happen - # before testing any assembler features, or else the result of the tests would - # be stale if the integrated assembler is not used. -@@ -430,22 +426,22 @@ endif - - ifdef building_out_of_srctree - CFLAGS += -I$(objtree)/include -- CFLAGS += -I$(objtree)/arch/$(TARGET_ARCH)/include -+ CFLAGS += -I$(objtree)/arch/$(SRCARCH)/include - endif - CFLAGS += -I$(srctree)/include --CFLAGS += -I$(srctree)/arch/$(TARGET_ARCH)/include -+CFLAGS += -I$(srctree)/arch/$(SRCARCH)/include - - # Note that link order matters! 
- ALL_OBJS-y := common/built_in.o - ALL_OBJS-y += drivers/built_in.o - ALL_OBJS-y += lib/built_in.o - ALL_OBJS-y += xsm/built_in.o --ALL_OBJS-y += arch/$(TARGET_ARCH)/built_in.o -+ALL_OBJS-y += arch/$(SRCARCH)/built_in.o - ALL_OBJS-$(CONFIG_CRYPTO) += crypto/built_in.o - - ALL_LIBS-y := lib/lib.a - --include $(srctree)/arch/$(TARGET_ARCH)/arch.mk -+include $(srctree)/arch/$(SRCARCH)/arch.mk - - # define new variables to avoid the ones defined in Config.mk - export XEN_CFLAGS := $(CFLAGS) -@@ -587,11 +583,11 @@ $(TARGET): outputmakefile FORCE - $(Q)$(MAKE) $(build)=tools - $(Q)$(MAKE) $(build)=. include/xen/compile.h - $(Q)$(MAKE) $(build)=include all -- $(Q)$(MAKE) $(build)=arch/$(TARGET_ARCH) include -- $(Q)$(MAKE) $(build)=. arch/$(TARGET_ARCH)/include/asm/asm-offsets.h -+ $(Q)$(MAKE) $(build)=arch/$(SRCARCH) include -+ $(Q)$(MAKE) $(build)=. arch/$(SRCARCH)/include/asm/asm-offsets.h - $(Q)$(MAKE) $(build)=. MKRELOC=$(MKRELOC) 'ALL_OBJS=$(ALL_OBJS-y)' 'ALL_LIBS=$(ALL_LIBS-y)' $@ - --SUBDIRS = xsm arch/$(TARGET_ARCH) common drivers lib test -+SUBDIRS = xsm arch/$(SRCARCH) common drivers lib test - define all_sources - ( find include -type f -name '*.h' -print; \ - find $(SUBDIRS) -type f -name '*.[chS]' -print ) -diff --git a/xen/Rules.mk b/xen/Rules.mk -index 59072ae8df..8af3dd7277 100644 ---- a/xen/Rules.mk -+++ b/xen/Rules.mk -@@ -180,7 +180,7 @@ cpp_flags = $(filter-out -Wa$(comma)% -flto,$(1)) - c_flags = -MMD -MP -MF $(depfile) $(XEN_CFLAGS) - a_flags = -MMD -MP -MF $(depfile) $(XEN_AFLAGS) - --include $(srctree)/arch/$(TARGET_ARCH)/Rules.mk -+include $(srctree)/arch/$(SRCARCH)/Rules.mk - - c_flags += $(_c_flags) - a_flags += $(_c_flags) -diff --git a/xen/build.mk b/xen/build.mk -index d049d3a53a..9ecb104f1e 100644 ---- a/xen/build.mk -+++ b/xen/build.mk -@@ -41,11 +41,11 @@ include/xen/compile.h: include/xen/compile.h.in .banner FORCE - targets += include/xen/compile.h - - -include $(wildcard .asm-offsets.s.d) --asm-offsets.s: 
arch/$(TARGET_ARCH)/$(ARCH)/asm-offsets.c -+asm-offsets.s: arch/$(SRCARCH)/$(ARCH)/asm-offsets.c - $(CC) $(call cpp_flags,$(c_flags)) -S -g0 -o $@.new -MQ $@ $< - $(call move-if-changed,$@.new,$@) - --arch/$(TARGET_ARCH)/include/asm/asm-offsets.h: asm-offsets.s -+arch/$(SRCARCH)/include/asm/asm-offsets.h: asm-offsets.s - @(set -e; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ -@@ -87,4 +87,4 @@ endif - targets += prelink.o - - $(TARGET): prelink.o FORCE -- $(Q)$(MAKE) $(build)=arch/$(TARGET_ARCH) $@ -+ $(Q)$(MAKE) $(build)=arch/$(SRCARCH) $@ --- -2.42.0 - diff --git a/0006-x86-intel-ensure-Global-Performance-Counter-Control-.patch b/0006-x86-intel-ensure-Global-Performance-Counter-Control-.patch new file mode 100644 index 0000000..dc64ad6 --- /dev/null +++ b/0006-x86-intel-ensure-Global-Performance-Counter-Control-.patch @@ -0,0 +1,74 @@ +From d0ad2cc5eac1b5d3cfd14204d377ce2384f52607 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Fri, 2 Feb 2024 08:02:20 +0100 +Subject: [PATCH 06/10] x86/intel: ensure Global Performance Counter Control is + setup correctly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When Architectural Performance Monitoring is available, the PERF_GLOBAL_CTRL +MSR contains per-counter enable bits that is ANDed with the enable bit in the +counter EVNTSEL MSR in order for a PMC counter to be enabled. + +So far the watchdog code seems to have relied on the PERF_GLOBAL_CTRL enable +bits being set by default, but at least on some Intel Sapphire and Emerald +Rapids this is no longer the case, and Xen reports: + +Testing NMI watchdog on all CPUs: 0 40 stuck + +The first CPU on each package is started with PERF_GLOBAL_CTRL zeroed, so PMC0 +doesn't start counting when the enable bit in EVNTSEL0 is set, due to the +relevant enable bit in PERF_GLOBAL_CTRL not being set. 
+ +Check and adjust PERF_GLOBAL_CTRL during CPU initialization so that all the +general-purpose PMCs are enabled. Doing so brings the state of the package-BSP +PERF_GLOBAL_CTRL in line with the rest of the CPUs on the system. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +master commit: 6bdb965178bbb3fc50cd4418d4770a7789956e2c +master date: 2024-01-17 10:40:52 +0100 +--- + xen/arch/x86/cpu/intel.c | 23 ++++++++++++++++++++++- + 1 file changed, 22 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c +index b40ac696e6..96723b5d44 100644 +--- a/xen/arch/x86/cpu/intel.c ++++ b/xen/arch/x86/cpu/intel.c +@@ -528,9 +528,30 @@ static void cf_check init_intel(struct cpuinfo_x86 *c) + init_intel_cacheinfo(c); + if (c->cpuid_level > 9) { + unsigned eax = cpuid_eax(10); ++ unsigned int cnt = (eax >> 8) & 0xff; ++ + /* Check for version and the number of counters */ +- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) ++ if ((eax & 0xff) && (cnt > 1) && (cnt <= 32)) { ++ uint64_t global_ctrl; ++ unsigned int cnt_mask = (1UL << cnt) - 1; ++ ++ /* ++ * On (some?) Sapphire/Emerald Rapids platforms each ++ * package-BSP starts with all the enable bits for the ++ * general-purpose PMCs cleared. Adjust so counters ++ * can be enabled from EVNTSEL. 
++ */ ++ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_ctrl); ++ if ((global_ctrl & cnt_mask) != cnt_mask) { ++ printk("CPU%u: invalid PERF_GLOBAL_CTRL: %#" ++ PRIx64 " adjusting to %#" PRIx64 "\n", ++ smp_processor_id(), global_ctrl, ++ global_ctrl | cnt_mask); ++ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ++ global_ctrl | cnt_mask); ++ } + __set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); ++ } + } + + if ( !cpu_has(c, X86_FEATURE_XTOPOLOGY) ) +-- +2.43.0 + diff --git a/0007-build-evaluate-XEN_BUILD_-and-XEN_DOMAIN-immediately.patch b/0007-build-evaluate-XEN_BUILD_-and-XEN_DOMAIN-immediately.patch deleted file mode 100644 index 81e5ca4..0000000 --- a/0007-build-evaluate-XEN_BUILD_-and-XEN_DOMAIN-immediately.patch +++ /dev/null @@ -1,58 +0,0 @@ -From a1f68fb56710c507f9c1ec8e8d784f5b1e4088f1 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Mon, 31 Jul 2023 15:02:18 +0200 -Subject: [PATCH 07/55] build: evaluate XEN_BUILD_* and XEN_DOMAIN immediately - -With GNU make 4.4, the number of execution of the command present in -these $(shell ) increased greatly. This is probably because as of make -4.4, exported variable are also added to the environment of $(shell ) -construct. - -Also, `make -d` shows a lot of these: - Makefile:15: not recursively expanding XEN_BUILD_DATE to export to shell function - Makefile:16: not recursively expanding XEN_BUILD_TIME to export to shell function - Makefile:17: not recursively expanding XEN_BUILD_HOST to export to shell function - Makefile:14: not recursively expanding XEN_DOMAIN to export to shell function - -So to avoid having these command been run more than necessary, we -will replace ?= by an equivalent but with immediate expansion. 
- -Reported-by: Jason Andryuk <jandryuk@gmail.com> -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Tested-by: Jason Andryuk <jandryuk@gmail.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 0c594c1b57ee2ecec5f70826c53a2cf02a9c2acb) ---- - xen/Makefile | 16 ++++++++++++---- - 1 file changed, 12 insertions(+), 4 deletions(-) - -diff --git a/xen/Makefile b/xen/Makefile -index 1a3b9a081f..7bb9de7bdc 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -11,10 +11,18 @@ export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) - -include xen-version - - export XEN_WHOAMI ?= $(USER) --export XEN_DOMAIN ?= $(shell ([ -x /bin/dnsdomainname ] && /bin/dnsdomainname) || ([ -x /bin/domainname ] && /bin/domainname || echo [unknown])) --export XEN_BUILD_DATE ?= $(shell LC_ALL=C date) --export XEN_BUILD_TIME ?= $(shell LC_ALL=C date +%T) --export XEN_BUILD_HOST ?= $(shell hostname) -+ifeq ($(origin XEN_DOMAIN), undefined) -+export XEN_DOMAIN := $(shell ([ -x /bin/dnsdomainname ] && /bin/dnsdomainname) || ([ -x /bin/domainname ] && /bin/domainname || echo [unknown])) -+endif -+ifeq ($(origin XEN_BUILD_DATE), undefined) -+export XEN_BUILD_DATE := $(shell LC_ALL=C date) -+endif -+ifeq ($(origin XEN_BUILD_TIME), undefined) -+export XEN_BUILD_TIME := $(shell LC_ALL=C date +%T) -+endif -+ifeq ($(origin XEN_BUILD_HOST), undefined) -+export XEN_BUILD_HOST := $(shell hostname) -+endif - - # Best effort attempt to find a python interpreter, defaulting to Python 3 if - # available. Fall back to just `python` if `which` is nowhere to be found. 
--- -2.42.0 - diff --git a/0007-x86-vmx-Fix-IRQ-handling-for-EXIT_REASON_INIT.patch b/0007-x86-vmx-Fix-IRQ-handling-for-EXIT_REASON_INIT.patch new file mode 100644 index 0000000..a1937a7 --- /dev/null +++ b/0007-x86-vmx-Fix-IRQ-handling-for-EXIT_REASON_INIT.patch @@ -0,0 +1,65 @@ +From eca5416f9b0e179de9553900de8de660ab09199d Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 2 Feb 2024 08:02:51 +0100 +Subject: [PATCH 07/10] x86/vmx: Fix IRQ handling for EXIT_REASON_INIT + +When receiving an INIT, a prior bugfix tried to ignore the INIT and continue +onwards. + +Unfortunately it's not safe to return at that point in vmx_vmexit_handler(). +Just out of context in the first hunk is a local_irqs_enabled() which is +depended-upon by the return-to-guest path, causing the following checklock +failure in debug builds: + + (XEN) Error: INIT received - ignoring + (XEN) CHECKLOCK FAILURE: prev irqsafe: 0, curr irqsafe 1 + (XEN) Xen BUG at common/spinlock.c:132 + (XEN) ----[ Xen-4.19-unstable x86_64 debug=y Tainted: H ]---- + ... + (XEN) Xen call trace: + (XEN) [<ffff82d040238e10>] R check_lock+0xcd/0xe1 + (XEN) [<ffff82d040238fe3>] F _spin_lock+0x1b/0x60 + (XEN) [<ffff82d0402ed6a8>] F pt_update_irq+0x32/0x3bb + (XEN) [<ffff82d0402b9632>] F vmx_intr_assist+0x3b/0x51d + (XEN) [<ffff82d040206447>] F vmx_asm_vmexit_handler+0xf7/0x210 + +Luckily, this is benign in release builds. Accidentally having IRQs disabled +when trying to take an IRQs-on lock isn't a deadlock-vulnerable pattern. + +Drop the problematic early return. In hindsight, it's wrong to skip other +normal VMExit steps. 
+ +Fixes: b1f11273d5a7 ("x86/vmx: Don't spuriously crash the domain when INIT is received") +Reported-by: Reima ISHII <ishiir@g.ecc.u-tokyo.ac.jp> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: d1f8883aebe00f6a9632d77ab0cd5c6d02c9cbe4 +master date: 2024-01-18 20:59:06 +0000 +--- + xen/arch/x86/hvm/vmx/vmx.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 072288a5ef..31f4a861c6 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -4037,7 +4037,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) + + case EXIT_REASON_INIT: + printk(XENLOG_ERR "Error: INIT received - ignoring\n"); +- return; /* Renter the guest without further processing */ ++ break; + } + + /* Now enable interrupts so it's safe to take locks. */ +@@ -4323,6 +4323,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) + break; + } + case EXIT_REASON_EXTERNAL_INTERRUPT: ++ case EXIT_REASON_INIT: + /* Already handled above. */ + break; + case EXIT_REASON_TRIPLE_FAULT: +-- +2.43.0 + diff --git a/0008-Config.mk-evaluate-XEN_COMPILE_ARCH-and-XEN_OS-immed.patch b/0008-Config.mk-evaluate-XEN_COMPILE_ARCH-and-XEN_OS-immed.patch deleted file mode 100644 index 8a4cb7d..0000000 --- a/0008-Config.mk-evaluate-XEN_COMPILE_ARCH-and-XEN_OS-immed.patch +++ /dev/null @@ -1,50 +0,0 @@ -From 476d2624ec3cf3e60709580ff1df208bb8f616e2 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Mon, 31 Jul 2023 15:02:34 +0200 -Subject: [PATCH 08/55] Config.mk: evaluate XEN_COMPILE_ARCH and XEN_OS - immediately - -With GNU make 4.4, the number of execution of the command present in -these $(shell ) increased greatly. This is probably because as of make -4.4, exported variable are also added to the environment of $(shell ) -construct. 
- -So to avoid having these command been run more than necessary, we -will replace ?= by an equivalent but with immediate expansion. - -Reported-by: Jason Andryuk <jandryuk@gmail.com> -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Tested-by: Jason Andryuk <jandryuk@gmail.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit a07414d989cf52e5e84192b78023bee1589bbda4) ---- - Config.mk | 8 ++++++-- - 1 file changed, 6 insertions(+), 2 deletions(-) - -diff --git a/Config.mk b/Config.mk -index 8bc2bcd5f6..4864033c73 100644 ---- a/Config.mk -+++ b/Config.mk -@@ -19,13 +19,17 @@ or = $(if $(strip $(1)),$(1),$(if $(strip $(2)),$(2),$(if $(strip $(3)),$( - - -include $(XEN_ROOT)/.config - --XEN_COMPILE_ARCH ?= $(shell uname -m | sed -e s/i.86/x86_32/ \ -+ifeq ($(origin XEN_COMPILE_ARCH), undefined) -+XEN_COMPILE_ARCH := $(shell uname -m | sed -e s/i.86/x86_32/ \ - -e s/i86pc/x86_32/ -e s/amd64/x86_64/ \ - -e s/armv7.*/arm32/ -e s/armv8.*/arm64/ \ - -e s/aarch64/arm64/) -+endif - - XEN_TARGET_ARCH ?= $(XEN_COMPILE_ARCH) --XEN_OS ?= $(shell uname -s) -+ifeq ($(origin XEN_OS), undefined) -+XEN_OS := $(shell uname -s) -+endif - - CONFIG_$(XEN_OS) := y - --- -2.42.0 - diff --git a/0008-x86-vmx-Disallow-the-use-of-inactivity-states.patch b/0008-x86-vmx-Disallow-the-use-of-inactivity-states.patch new file mode 100644 index 0000000..12c2d59 --- /dev/null +++ b/0008-x86-vmx-Disallow-the-use-of-inactivity-states.patch @@ -0,0 +1,126 @@ +From 7bd612727df792671e44152a8205f0cf821ad984 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 2 Feb 2024 08:03:26 +0100 +Subject: [PATCH 08/10] x86/vmx: Disallow the use of inactivity states + +Right now, vvmx will blindly copy L12's ACTIVITY_STATE into the L02 VMCS and +enter the vCPU. Luckily for us, nested-virt is explicitly unsupported for +security bugs. 
+ +The inactivity states are HLT, SHUTDOWN and WAIT-FOR-SIPI, and as noted by the +SDM in Vol3 27.7 "Special Features of VM Entry": + + If VM entry ends with the logical processor in an inactive activity state, + the VM entry generates any special bus cycle that is normally generated when + that activity state is entered from the active state. + +Also, + + Some activity states unconditionally block certain events. + +I.e. A VMEntry with ACTIVITY=SHUTDOWN will initiate a platform reset, while a +VMEntry with ACTIVITY=WAIT-FOR-SIPI will really block everything other than +SIPIs. + +Both of these activity states are for the TXT ACM to use, not for regular +hypervisors, and Xen doesn't support dropping the HLT intercept either. + +There are two paths in Xen which operate on ACTIVITY_STATE. + +1) The vmx_{get,set}_nonreg_state() helpers for VM-Fork. + + As regular VMs can't use any inactivity states, this is just duplicating + the 0 from construct_vmcs(). Retain the ability to query activity_state, + but crash the domain on any attempt to set an inactivity state. + +2) Nested virt, because of ACTIVITY_STATE in vmcs_gstate_field[]. + + Explicitly hide the inactivity states in the guest's view of MSR_VMX_MISC, + and remove ACTIVITY_STATE from vmcs_gstate_field[]. + + In virtual_vmentry(), we should trigger a VMEntry failure for the use of + any inactivity states, but there's no support for that in the code at all + so leave a TODO for when we finally start working on nested-virt in + earnest. 
+ +Reported-by: Reima Ishii <ishiir@g.ecc.u-tokyo.ac.jp> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Tamas K Lengyel <tamas@tklengyel.com> +master commit: 3643bb53a05b7c8fbac072c63bef1538f2a6d0d2 +master date: 2024-01-18 20:59:06 +0000 +--- + xen/arch/x86/hvm/vmx/vmx.c | 8 +++++++- + xen/arch/x86/hvm/vmx/vvmx.c | 9 +++++++-- + xen/arch/x86/include/asm/hvm/vmx/vmcs.h | 1 + + 3 files changed, 15 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 31f4a861c6..35d391d8e5 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -1499,7 +1499,13 @@ static void cf_check vmx_set_nonreg_state(struct vcpu *v, + { + vmx_vmcs_enter(v); + +- __vmwrite(GUEST_ACTIVITY_STATE, nrs->vmx.activity_state); ++ if ( nrs->vmx.activity_state ) ++ { ++ printk("Attempt to set %pv activity_state %#lx\n", ++ v, nrs->vmx.activity_state); ++ domain_crash(v->domain); ++ } ++ + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, nrs->vmx.interruptibility_info); + __vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, nrs->vmx.pending_dbg); + +diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c +index f8fe8d0c14..515cb5ae77 100644 +--- a/xen/arch/x86/hvm/vmx/vvmx.c ++++ b/xen/arch/x86/hvm/vmx/vvmx.c +@@ -910,7 +910,10 @@ static const u16 vmcs_gstate_field[] = { + GUEST_LDTR_AR_BYTES, + GUEST_TR_AR_BYTES, + GUEST_INTERRUPTIBILITY_INFO, ++ /* ++ * ACTIVITY_STATE is handled specially. 
+ GUEST_ACTIVITY_STATE, ++ */ + GUEST_SYSENTER_CS, + GUEST_PREEMPTION_TIMER, + /* natural */ +@@ -1211,6 +1214,8 @@ static void virtual_vmentry(struct cpu_user_regs *regs) + nvcpu->nv_vmentry_pending = 0; + nvcpu->nv_vmswitch_in_progress = 1; + ++ /* TODO: Fail VMentry for GUEST_ACTIVITY_STATE != 0 */ ++ + /* + * EFER handling: + * hvm_set_efer won't work if CR0.PG = 1, so we change the value +@@ -2327,8 +2332,8 @@ int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content) + data = hvm_cr4_guest_valid_bits(d); + break; + case MSR_IA32_VMX_MISC: +- /* Do not support CR3-target feature now */ +- data = host_data & ~VMX_MISC_CR3_TARGET; ++ /* Do not support CR3-targets or activity states. */ ++ data = host_data & ~(VMX_MISC_CR3_TARGET | VMX_MISC_ACTIVITY_MASK); + break; + case MSR_IA32_VMX_EPT_VPID_CAP: + data = nept_get_ept_vpid_cap(); +diff --git a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h +index 78404e42b3..0af021d5f5 100644 +--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h ++++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h +@@ -288,6 +288,7 @@ extern u32 vmx_secondary_exec_control; + #define VMX_VPID_INVVPID_SINGLE_CONTEXT_RETAINING_GLOBAL 0x80000000000ULL + extern u64 vmx_ept_vpid_cap; + ++#define VMX_MISC_ACTIVITY_MASK 0x000001c0 + #define VMX_MISC_PROC_TRACE 0x00004000 + #define VMX_MISC_CR3_TARGET 0x01ff0000 + #define VMX_MISC_VMWRITE_ALL 0x20000000 +-- +2.43.0 + diff --git a/0009-lib-fdt-elf-move-lib-fdt-elf-temp.o-and-their-deps-t.patch b/0009-lib-fdt-elf-move-lib-fdt-elf-temp.o-and-their-deps-t.patch new file mode 100644 index 0000000..9ee7104 --- /dev/null +++ b/0009-lib-fdt-elf-move-lib-fdt-elf-temp.o-and-their-deps-t.patch @@ -0,0 +1,70 @@ +From afb85cf1e8f165abf88de9d8a6df625692a753b1 Mon Sep 17 00:00:00 2001 +From: Michal Orzel <michal.orzel@amd.com> +Date: Fri, 2 Feb 2024 08:04:07 +0100 +Subject: [PATCH 09/10] lib{fdt,elf}: move lib{fdt,elf}-temp.o and their deps + to $(targets) + +At the moment, trying to run 
xencov read/reset (calling SYSCTL_coverage_op +under the hood) results in a crash. This is due to a profiler trying to +access data in the .init.* sections (libfdt for Arm and libelf for x86) +that are stripped after boot. Normally, the build system compiles any +*.init.o file without COV_FLAGS. However, these two libraries are +handled differently as sections will be renamed to init after linking. + +To override COV_FLAGS to empty for these libraries, lib{fdt,elf}.o were +added to nocov-y. This worked until e321576f4047 ("xen/build: start using +if_changed") that added lib{fdt,elf}-temp.o and their deps to extra-y. +This way, even though these objects appear as prerequisites of +lib{fdt,elf}.o and the settings should propagate to them, make can also +build them as a prerequisite of __build, in which case COV_FLAGS would +still have the unwanted flags. Fix it by switching to $(targets) instead. + +Also, for libfdt, append libfdt.o to nocov-y only if CONFIG_OVERLAY_DTB +is not set. Otherwise, there is no section renaming and we should be able +to run the coverage. 
+ +Fixes: e321576f4047 ("xen/build: start using if_changed") +Signed-off-by: Michal Orzel <michal.orzel@amd.com> +Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +master commit: 79519fcfa0605bbf19d8c02b979af3a2c8afed68 +master date: 2024-01-23 12:02:44 +0100 +--- + xen/common/libelf/Makefile | 2 +- + xen/common/libfdt/Makefile | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/common/libelf/Makefile b/xen/common/libelf/Makefile +index 8a4522e4e1..917d12b006 100644 +--- a/xen/common/libelf/Makefile ++++ b/xen/common/libelf/Makefile +@@ -13,4 +13,4 @@ $(obj)/libelf.o: $(obj)/libelf-temp.o FORCE + $(obj)/libelf-temp.o: $(addprefix $(obj)/,$(libelf-objs)) FORCE + $(call if_changed,ld) + +-extra-y += libelf-temp.o $(libelf-objs) ++targets += libelf-temp.o $(libelf-objs) +diff --git a/xen/common/libfdt/Makefile b/xen/common/libfdt/Makefile +index 75aaefa2e3..4d14fd61ba 100644 +--- a/xen/common/libfdt/Makefile ++++ b/xen/common/libfdt/Makefile +@@ -2,9 +2,9 @@ include $(src)/Makefile.libfdt + + SECTIONS := text data $(SPECIAL_DATA_SECTIONS) + OBJCOPYFLAGS := $(foreach s,$(SECTIONS),--rename-section .$(s)=.init.$(s)) ++nocov-y += libfdt.o + + obj-y += libfdt.o +-nocov-y += libfdt.o + + CFLAGS-y += -I$(srctree)/include/xen/libfdt/ + +@@ -14,4 +14,4 @@ $(obj)/libfdt.o: $(obj)/libfdt-temp.o FORCE + $(obj)/libfdt-temp.o: $(addprefix $(obj)/,$(LIBFDT_OBJS)) FORCE + $(call if_changed,ld) + +-extra-y += libfdt-temp.o $(LIBFDT_OBJS) ++targets += libfdt-temp.o $(LIBFDT_OBJS) +-- +2.43.0 + diff --git a/0009-x86emul-rework-wrapping-of-libc-functions-in-test-an.patch b/0009-x86emul-rework-wrapping-of-libc-functions-in-test-an.patch deleted file mode 100644 index 4f9c0bb..0000000 --- a/0009-x86emul-rework-wrapping-of-libc-functions-in-test-an.patch +++ /dev/null @@ -1,245 +0,0 @@ -From 37f1d68fa34220600f1e4ec82af5da70127757e5 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Fri, 18 Aug 
2023 15:04:28 +0200 -Subject: [PATCH 09/55] x86emul: rework wrapping of libc functions in test and - fuzzing harnesses - -Our present approach is working fully behind the compiler's back. This -was found to not work with LTO. Employ ld's --wrap= option instead. Note -that while this makes the build work at least with new enough gcc (it -doesn't with gcc7, for example, due to tool chain side issues afaict), -according to my testing things still won't work when building the -fuzzing harness with afl-cc: While with the gcc7 tool chain I see afl-as -getting invoked, this does not happen with gcc13. Yet without using that -assembler wrapper the resulting binary will look uninstrumented to -afl-fuzz. - -While checking the resulting binaries I noticed that we've gained uses -of snprintf() and strstr(), which only just so happen to not cause any -problems. Add a wrappers for them as well. - -Since we don't have any actual uses of v{,sn}printf(), no definitions of -their wrappers appear (just yet). But I think we want -__wrap_{,sn}printf() to properly use __real_v{,sn}printf() right away, -which means we need delarations of the latter. 
- -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -(cherry picked from commit 6fba45ca3be1c5d46cddb1eaf371d9e69550b244) ---- - tools/fuzz/x86_instruction_emulator/Makefile | 6 ++- - tools/tests/x86_emulator/Makefile | 4 +- - tools/tests/x86_emulator/wrappers.c | 55 ++++++++++++++------ - tools/tests/x86_emulator/x86-emulate.h | 14 +++-- - 4 files changed, 53 insertions(+), 26 deletions(-) - -diff --git a/tools/fuzz/x86_instruction_emulator/Makefile b/tools/fuzz/x86_instruction_emulator/Makefile -index 13aa238503..c83959c847 100644 ---- a/tools/fuzz/x86_instruction_emulator/Makefile -+++ b/tools/fuzz/x86_instruction_emulator/Makefile -@@ -29,6 +29,8 @@ GCOV_FLAGS := --coverage - %-cov.o: %.c - $(CC) -c $(CFLAGS) $(GCOV_FLAGS) $< -o $@ - -+WRAPPED = $(shell sed -n 's,^ *WRAP(\([[:alnum:]_]*\));,\1,p' x86-emulate.h) -+ - x86-emulate.h: x86_emulate/x86_emulate.h - x86-emulate.o x86-emulate-cov.o: x86-emulate.h x86_emulate/x86_emulate.c - fuzz-emul.o fuzz-emul-cov.o wrappers.o: x86-emulate.h -@@ -37,10 +39,10 @@ x86-insn-fuzzer.a: fuzz-emul.o x86-emulate.o cpuid.o - $(AR) rc $@ $^ - - afl-harness: afl-harness.o fuzz-emul.o x86-emulate.o cpuid.o wrappers.o -- $(CC) $(CFLAGS) $^ -o $@ -+ $(CC) $(CFLAGS) $(addprefix -Wl$(comma)--wrap=,$(WRAPPED)) $^ -o $@ - - afl-harness-cov: afl-harness-cov.o fuzz-emul-cov.o x86-emulate-cov.o cpuid.o wrappers.o -- $(CC) $(CFLAGS) $(GCOV_FLAGS) $^ -o $@ -+ $(CC) $(CFLAGS) $(GCOV_FLAGS) $(addprefix -Wl$(comma)--wrap=,$(WRAPPED)) $^ -o $@ - - # Common targets - .PHONY: all -diff --git a/tools/tests/x86_emulator/Makefile b/tools/tests/x86_emulator/Makefile -index bd82598f97..a2fd6607c6 100644 ---- a/tools/tests/x86_emulator/Makefile -+++ b/tools/tests/x86_emulator/Makefile -@@ -250,8 +250,10 @@ xop.h avx512f.h: simd-fma.c - - 
endif # 32-bit override - -+WRAPPED := $(shell sed -n 's,^ *WRAP(\([[:alnum:]_]*\));,\1,p' x86-emulate.h) -+ - $(TARGET): x86-emulate.o cpuid.o test_x86_emulator.o evex-disp8.o predicates.o wrappers.o -- $(HOSTCC) $(HOSTCFLAGS) -o $@ $^ -+ $(HOSTCC) $(HOSTCFLAGS) $(addprefix -Wl$(comma)--wrap=,$(WRAPPED)) -o $@ $^ - - .PHONY: clean - clean: -diff --git a/tools/tests/x86_emulator/wrappers.c b/tools/tests/x86_emulator/wrappers.c -index eba7cc93c5..3829a6f416 100644 ---- a/tools/tests/x86_emulator/wrappers.c -+++ b/tools/tests/x86_emulator/wrappers.c -@@ -1,78 +1,103 @@ - #include <stdarg.h> - --#define WRAP(x) typeof(x) emul_##x -+#define WRAP(x) typeof(x) __wrap_ ## x, __real_ ## x - #include "x86-emulate.h" - --size_t emul_fwrite(const void *src, size_t sz, size_t n, FILE *f) -+size_t __wrap_fwrite(const void *src, size_t sz, size_t n, FILE *f) - { - emul_save_fpu_state(); -- sz = fwrite(src, sz, n, f); -+ sz = __real_fwrite(src, sz, n, f); - emul_restore_fpu_state(); - - return sz; - } - --int emul_memcmp(const void *p1, const void *p2, size_t sz) -+int __wrap_memcmp(const void *p1, const void *p2, size_t sz) - { - int rc; - - emul_save_fpu_state(); -- rc = memcmp(p1, p2, sz); -+ rc = __real_memcmp(p1, p2, sz); - emul_restore_fpu_state(); - - return rc; - } - --void *emul_memcpy(void *dst, const void *src, size_t sz) -+void *__wrap_memcpy(void *dst, const void *src, size_t sz) - { - emul_save_fpu_state(); -- memcpy(dst, src, sz); -+ __real_memcpy(dst, src, sz); - emul_restore_fpu_state(); - - return dst; - } - --void *emul_memset(void *dst, int c, size_t sz) -+void *__wrap_memset(void *dst, int c, size_t sz) - { - emul_save_fpu_state(); -- memset(dst, c, sz); -+ __real_memset(dst, c, sz); - emul_restore_fpu_state(); - - return dst; - } - --int emul_printf(const char *fmt, ...) -+int __wrap_printf(const char *fmt, ...) 
- { - va_list varg; - int rc; - - emul_save_fpu_state(); - va_start(varg, fmt); -- rc = vprintf(fmt, varg); -+ rc = __real_vprintf(fmt, varg); - va_end(varg); - emul_restore_fpu_state(); - - return rc; - } - --int emul_putchar(int c) -+int __wrap_putchar(int c) - { - int rc; - - emul_save_fpu_state(); -- rc = putchar(c); -+ rc = __real_putchar(c); - emul_restore_fpu_state(); - - return rc; - } - --int emul_puts(const char *str) -+int __wrap_puts(const char *str) - { - int rc; - - emul_save_fpu_state(); -- rc = puts(str); -+ rc = __real_puts(str); - emul_restore_fpu_state(); - - return rc; - } -+ -+int __wrap_snprintf(char *buf, size_t n, const char *fmt, ...) -+{ -+ va_list varg; -+ int rc; -+ -+ emul_save_fpu_state(); -+ va_start(varg, fmt); -+ rc = __real_vsnprintf(buf, n, fmt, varg); -+ va_end(varg); -+ emul_restore_fpu_state(); -+ -+ return rc; -+} -+ -+char *__wrap_strstr(const char *s1, const char *s2) -+{ -+ char *s; -+ -+ emul_save_fpu_state(); -+ s = __real_strstr(s1, s2); -+ emul_restore_fpu_state(); -+ -+ return s; -+} -diff --git a/tools/tests/x86_emulator/x86-emulate.h b/tools/tests/x86_emulator/x86-emulate.h -index 19bea9c38d..58760f096d 100644 ---- a/tools/tests/x86_emulator/x86-emulate.h -+++ b/tools/tests/x86_emulator/x86-emulate.h -@@ -29,9 +29,7 @@ - #ifdef EOF - # error "Must not include <stdio.h> before x86-emulate.h" - #endif --#ifdef WRAP --# include <stdio.h> --#endif -+#include <stdio.h> - - #include <xen/xen.h> - -@@ -85,11 +83,7 @@ void emul_restore_fpu_state(void); - * around the actual function. - */ - #ifndef WRAP --# if 0 /* This only works for explicit calls, not for compiler generated ones. 
*/ --# define WRAP(x) typeof(x) x asm("emul_" #x) --# else --# define WRAP(x) asm(".equ " #x ", emul_" #x) --# endif -+# define WRAP(x) typeof(x) __wrap_ ## x - #endif - - WRAP(fwrite); -@@ -99,6 +93,10 @@ WRAP(memset); - WRAP(printf); - WRAP(putchar); - WRAP(puts); -+WRAP(snprintf); -+WRAP(strstr); -+WRAP(vprintf); -+WRAP(vsnprintf); - - #undef WRAP - --- -2.42.0 - diff --git a/0010-rombios-Work-around-GCC-issue-99578.patch b/0010-rombios-Work-around-GCC-issue-99578.patch deleted file mode 100644 index 3995f02..0000000 --- a/0010-rombios-Work-around-GCC-issue-99578.patch +++ /dev/null @@ -1,43 +0,0 @@ -From ae1045c42954772e48862162d0e95fbc9393c91e Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 17 Aug 2023 21:32:53 +0100 -Subject: [PATCH 10/55] rombios: Work around GCC issue 99578 - -GCC 12 objects to pointers derived from a constant: - - util.c: In function 'find_rsdp': - util.c:429:16: error: array subscript 0 is outside array bounds of 'uint16_t[0]' {aka 'short unsigned int[]'} [-Werror=array-bounds] - 429 | ebda_seg = *(uint16_t *)ADDR_FROM_SEG_OFF(0x40, 0xe); - cc1: all warnings being treated as errors - -This is a GCC bug, but work around it rather than turning array-bounds -checking off generally. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit e35138a2ffbe1fe71edaaaaae71063dc545a8416) ---- - tools/firmware/rombios/32bit/util.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/tools/firmware/rombios/32bit/util.c b/tools/firmware/rombios/32bit/util.c -index 6c1c480514..a47e000a26 100644 ---- a/tools/firmware/rombios/32bit/util.c -+++ b/tools/firmware/rombios/32bit/util.c -@@ -424,10 +424,10 @@ static struct acpi_20_rsdp *__find_rsdp(const void *start, unsigned int len) - struct acpi_20_rsdp *find_rsdp(void) - { - struct acpi_20_rsdp *rsdp; -- uint16_t ebda_seg; -+ uint16_t *volatile /* GCC issue 99578 */ ebda_seg = -+ ADDR_FROM_SEG_OFF(0x40, 0xe); - -- ebda_seg = *(uint16_t *)ADDR_FROM_SEG_OFF(0x40, 0xe); -- rsdp = __find_rsdp((void *)(ebda_seg << 16), 1024); -+ rsdp = __find_rsdp((void *)(*ebda_seg << 16), 1024); - if (!rsdp) - rsdp = __find_rsdp((void *)0xE0000, 0x20000); - --- -2.42.0 - diff --git a/0010-x86-p2m-pt-fix-off-by-one-in-entry-check-assert.patch b/0010-x86-p2m-pt-fix-off-by-one-in-entry-check-assert.patch new file mode 100644 index 0000000..ba99063 --- /dev/null +++ b/0010-x86-p2m-pt-fix-off-by-one-in-entry-check-assert.patch @@ -0,0 +1,36 @@ +From 091466ba55d1e2e75738f751818ace2e3ed08ccf Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Fri, 2 Feb 2024 08:04:33 +0100 +Subject: [PATCH 10/10] x86/p2m-pt: fix off by one in entry check assert +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The MMIO RO rangeset overlap check is bogus: the rangeset is inclusive so the +passed end mfn should be the last mfn to be mapped (not last + 1). 
+ +Fixes: 6fa1755644d0 ('amd/npt/shadow: replace assert that prevents creating 2M/1G MMIO entries') +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: George Dunlap <george.dunlap@cloud.com> +master commit: 610775d0dd61c1bd2f4720c755986098e6a5bafd +master date: 2024-01-25 16:09:04 +0100 +--- + xen/arch/x86/mm/p2m-pt.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c +index eaba2b0fb4..f02ebae372 100644 +--- a/xen/arch/x86/mm/p2m-pt.c ++++ b/xen/arch/x86/mm/p2m-pt.c +@@ -564,7 +564,7 @@ static void check_entry(mfn_t mfn, p2m_type_t new, p2m_type_t old, + if ( new == p2m_mmio_direct ) + ASSERT(!mfn_eq(mfn, INVALID_MFN) && + !rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), +- mfn_x(mfn) + (1ul << order))); ++ mfn_x(mfn) + (1UL << order) - 1)); + else if ( p2m_allows_invalid_mfn(new) || new == p2m_invalid || + new == p2m_mmio_dm ) + ASSERT(mfn_valid(mfn) || mfn_eq(mfn, INVALID_MFN)); +-- +2.43.0 + diff --git a/0011-rombios-Avoid-using-K-R-function-syntax.patch b/0011-rombios-Avoid-using-K-R-function-syntax.patch deleted file mode 100644 index 0bd761f..0000000 --- a/0011-rombios-Avoid-using-K-R-function-syntax.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 24487fec3bbebbc1fd3f00d16bca7fb0f56a5f30 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 18 Aug 2023 10:47:46 +0100 -Subject: [PATCH 11/55] rombios: Avoid using K&R function syntax - -Clang-15 complains: - - tcgbios.c:598:25: error: a function declaration without a prototype is deprecated in all versions of C [-Werror,-Wstrict-prototypes] - void tcpa_calling_int19h() - ^ - void - -C2x formally removes K&R syntax. The declarations for these functions in -32bitprotos.h are already ANSI compatible. Update the definitions to match. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit a562afa5679d4a7ceb9cb9222fec1fea9a61f738) ---- - tools/firmware/rombios/32bit/tcgbios/tcgbios.c | 10 +++++----- - 1 file changed, 5 insertions(+), 5 deletions(-) - -diff --git a/tools/firmware/rombios/32bit/tcgbios/tcgbios.c b/tools/firmware/rombios/32bit/tcgbios/tcgbios.c -index fa22c4460a..ad0eac0d20 100644 ---- a/tools/firmware/rombios/32bit/tcgbios/tcgbios.c -+++ b/tools/firmware/rombios/32bit/tcgbios/tcgbios.c -@@ -595,7 +595,7 @@ static void tcpa_add_measurement(uint32_t pcrIndex, - /* - * Add measurement to log about call of int 19h - */ --void tcpa_calling_int19h() -+void tcpa_calling_int19h(void) - { - tcpa_add_measurement(4, EV_ACTION, 0); - } -@@ -603,7 +603,7 @@ void tcpa_calling_int19h() - /* - * Add measurement to log about retuning from int 19h - */ --void tcpa_returned_int19h() -+void tcpa_returned_int19h(void) - { - tcpa_add_measurement(4, EV_ACTION, 1); - } -@@ -611,7 +611,7 @@ void tcpa_returned_int19h() - /* - * Add event separators for PCRs 0 to 7; specs 8.2.3 - */ --void tcpa_add_event_separators() -+void tcpa_add_event_separators(void) - { - uint32_t pcrIndex = 0; - while (pcrIndex <= 7) { -@@ -624,7 +624,7 @@ void tcpa_add_event_separators() - /* - * Add a wake event to the log - */ --void tcpa_wake_event() -+void tcpa_wake_event(void) - { - tcpa_add_measurement_to_log(6, - EV_ACTION, -@@ -659,7 +659,7 @@ void tcpa_add_bootdevice(uint32_t bootcd, uint32_t bootdrv) - * Add measurement to the log about option rom scan - * 10.4.3 : action 14 - */ --void tcpa_start_option_rom_scan() -+void tcpa_start_option_rom_scan(void) - { - tcpa_add_measurement(2, EV_ACTION, 14); - } --- -2.42.0 - diff --git a/0012-rombios-Remove-the-use-of-egrep.patch b/0012-rombios-Remove-the-use-of-egrep.patch deleted file mode 100644 index 44702b4..0000000 --- a/0012-rombios-Remove-the-use-of-egrep.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 
e418a77295e6b512d212b57123c11e4d4fb23e8c Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 18 Aug 2023 11:05:00 +0100 -Subject: [PATCH 12/55] rombios: Remove the use of egrep - -As the Alpine 3.18 container notes: - - egrep: warning: egrep is obsolescent; using grep -E - -Adjust it. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 5ddac3c2852ecc120acab86fc403153a2097c5dc) ---- - tools/firmware/rombios/32bit/Makefile | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/firmware/rombios/32bit/Makefile b/tools/firmware/rombios/32bit/Makefile -index c058c71551..50d45647c2 100644 ---- a/tools/firmware/rombios/32bit/Makefile -+++ b/tools/firmware/rombios/32bit/Makefile -@@ -26,7 +26,7 @@ $(TARGET): 32bitbios_all.o - 32bitbios_all.o: 32bitbios.o tcgbios/tcgbiosext.o util.o pmm.o - $(LD) $(LDFLAGS_DIRECT) -s -r $^ -o 32bitbios_all.o - @nm 32bitbios_all.o | \ -- egrep '^ +U ' >/dev/null && { \ -+ grep -E '^ +U ' >/dev/null && { \ - echo "There are undefined symbols in the BIOS:"; \ - nm -u 32bitbios_all.o; \ - exit 11; \ --- -2.42.0 - diff --git a/0013-CI-Resync-FreeBSD-config-with-staging.patch b/0013-CI-Resync-FreeBSD-config-with-staging.patch deleted file mode 100644 index dcd867b..0000000 --- a/0013-CI-Resync-FreeBSD-config-with-staging.patch +++ /dev/null @@ -1,62 +0,0 @@ -From f00d56309533427981f09ef2614f1bae4bcab62e Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 17 Feb 2023 11:16:32 +0000 -Subject: [PATCH 13/55] CI: Resync FreeBSD config with staging -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -CI: Update FreeBSD to 13.1 - -Also print the compiler version before starting. It's not easy to find -otherwise, and does change from time to time. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -(cherry picked from commit 5e7667ea2dd33e0e5e0f3a96db37fdb4ecd98fba) - -CI: Update FreeBSD to 13.2 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Stefano Stabellini <sstabellini@kernel.org> -(cherry picked from commit f872a624cbf92de9944483eea7674ef80ced1380) - -CI: Update FreeBSD to 12.4 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -(cherry picked from commit a73560896ce3c513460f26bd1c205060d6ec4f8a) ---- - .cirrus.yml | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/.cirrus.yml b/.cirrus.yml -index c38333e736..7e0beb200d 100644 ---- a/.cirrus.yml -+++ b/.cirrus.yml -@@ -10,19 +10,20 @@ freebsd_template: &FREEBSD_TEMPLATE - libxml2 glib git - - build_script: -+ - cc --version - - ./configure --with-system-seabios=/usr/local/share/seabios/bios.bin - - gmake -j`sysctl -n hw.ncpu` clang=y - - task: - name: 'FreeBSD 12' - freebsd_instance: -- image_family: freebsd-12-3 -+ image_family: freebsd-12-4 - << : *FREEBSD_TEMPLATE - - task: - name: 'FreeBSD 13' - freebsd_instance: -- image_family: freebsd-13-0 -+ image_family: freebsd-13-2 - << : *FREEBSD_TEMPLATE - - task: --- -2.42.0 - diff --git a/0014-tools-vchan-Fix-Wsingle-bit-bitfield-constant-conver.patch b/0014-tools-vchan-Fix-Wsingle-bit-bitfield-constant-conver.patch deleted file mode 100644 index 6e29490..0000000 --- a/0014-tools-vchan-Fix-Wsingle-bit-bitfield-constant-conver.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 052a8d24bc670ab6503e21dfd2fb8bccfc22aa73 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 8 Aug 2023 14:53:42 +0100 -Subject: [PATCH 14/55] tools/vchan: Fix - -Wsingle-bit-bitfield-constant-conversion - -Gitlab reports: - - node.c:158:17: error: implicit truncation from 'int' to a one-bit wide bit-field changes value from 1 to -1 
[-Werror,-Wsingle-bit-bitfield-constant-conversion] - - ctrl->blocking = 1; - ^ ~ - 1 error generated. - make[4]: *** [/builds/xen-project/people/andyhhp/xen/tools/vchan/../../tools/Rules.mk:188: node.o] Error 1 - -In Xen 4.18, this was fixed with c/s 99ab02f63ea8 ("tools: convert bitfields -to unsigned type") but this is an ABI change which can't be backported. - -Swich 1 for -1 to provide a minimally invasive way to fix the build. - -No functional change. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> ---- - tools/vchan/node.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/vchan/node.c b/tools/vchan/node.c -index f1638f013d..a28293b720 100644 ---- a/tools/vchan/node.c -+++ b/tools/vchan/node.c -@@ -155,7 +155,7 @@ int main(int argc, char **argv) - perror("libxenvchan_*_init"); - exit(1); - } -- ctrl->blocking = 1; -+ ctrl->blocking = -1; - - srand(seed); - fprintf(stderr, "seed=%d\n", seed); --- -2.42.0 - diff --git a/0015-xen-vcpu-ignore-VCPU_SSHOTTMR_future.patch b/0015-xen-vcpu-ignore-VCPU_SSHOTTMR_future.patch deleted file mode 100644 index 81e010b..0000000 --- a/0015-xen-vcpu-ignore-VCPU_SSHOTTMR_future.patch +++ /dev/null @@ -1,143 +0,0 @@ -From 7b5155a79ea946dd513847d4e7ad2b7e6a4ebb73 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Sep 2023 08:45:29 +0200 -Subject: [PATCH 15/55] xen/vcpu: ignore VCPU_SSHOTTMR_future -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The usage of VCPU_SSHOTTMR_future in Linux prior to 4.7 is bogus. -When the hypervisor returns -ETIME (timeout in the past) Linux keeps -retrying to setup the timer with a higher timeout instead of -self-injecting a timer interrupt. - -On boxes without any hardware assistance for logdirty we have seen HVM -Linux guests < 4.7 with 32vCPUs give up trying to setup the timer when -logdirty is enabled: - -CE: Reprogramming failure. 
Giving up -CE: xen increased min_delta_ns to 1000000 nsec -CE: Reprogramming failure. Giving up -CE: Reprogramming failure. Giving up -CE: xen increased min_delta_ns to 506250 nsec -CE: xen increased min_delta_ns to 759375 nsec -CE: xen increased min_delta_ns to 1000000 nsec -CE: Reprogramming failure. Giving up -CE: Reprogramming failure. Giving up -CE: Reprogramming failure. Giving up -Freezing user space processes ... -INFO: rcu_sched detected stalls on CPUs/tasks: { 14} (detected by 10, t=60002 jiffies, g=4006, c=4005, q=14130) -Task dump for CPU 14: -swapper/14 R running task 0 0 1 0x00000000 -Call Trace: - [<ffffffff90160f5d>] ? rcu_eqs_enter_common.isra.30+0x3d/0xf0 - [<ffffffff907b9bde>] ? default_idle+0x1e/0xd0 - [<ffffffff90039570>] ? arch_cpu_idle+0x20/0xc0 - [<ffffffff9010820a>] ? cpu_startup_entry+0x14a/0x1e0 - [<ffffffff9005d3a7>] ? start_secondary+0x1f7/0x270 - [<ffffffff900000d5>] ? start_cpu+0x5/0x14 -INFO: rcu_sched detected stalls on CPUs/tasks: { 26} (detected by 24, t=60002 jiffies, g=6922, c=6921, q=7013) -Task dump for CPU 26: -swapper/26 R running task 0 0 1 0x00000000 -Call Trace: - [<ffffffff90160f5d>] ? rcu_eqs_enter_common.isra.30+0x3d/0xf0 - [<ffffffff907b9bde>] ? default_idle+0x1e/0xd0 - [<ffffffff90039570>] ? arch_cpu_idle+0x20/0xc0 - [<ffffffff9010820a>] ? cpu_startup_entry+0x14a/0x1e0 - [<ffffffff9005d3a7>] ? start_secondary+0x1f7/0x270 - [<ffffffff900000d5>] ? start_cpu+0x5/0x14 -INFO: rcu_sched detected stalls on CPUs/tasks: { 26} (detected by 24, t=60002 jiffies, g=8499, c=8498, q=7664) -Task dump for CPU 26: -swapper/26 R running task 0 0 1 0x00000000 -Call Trace: - [<ffffffff90160f5d>] ? rcu_eqs_enter_common.isra.30+0x3d/0xf0 - [<ffffffff907b9bde>] ? default_idle+0x1e/0xd0 - [<ffffffff90039570>] ? arch_cpu_idle+0x20/0xc0 - [<ffffffff9010820a>] ? cpu_startup_entry+0x14a/0x1e0 - [<ffffffff9005d3a7>] ? start_secondary+0x1f7/0x270 - [<ffffffff900000d5>] ? 
start_cpu+0x5/0x14 - -Thus leading to CPU stalls and a broken system as a result. - -Workaround this bogus usage by ignoring the VCPU_SSHOTTMR_future in -the hypervisor. Old Linux versions are the only ones known to have -(wrongly) attempted to use the flag, and ignoring it is compatible -with the behavior expected by any guests setting that flag. - -Note the usage of the flag has been removed from Linux by commit: - -c06b6d70feb3 xen/x86: don't lose event interrupts - -Which landed in Linux 4.7. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Henry Wang <Henry.Wang@arm.com> # CHANGELOG -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: 19c6cbd90965b1440bd551069373d6fa3f2f365d -master date: 2023-05-03 13:36:05 +0200 ---- - CHANGELOG.md | 6 ++++++ - xen/common/domain.c | 13 ++++++++++--- - xen/include/public/vcpu.h | 5 ++++- - 3 files changed, 20 insertions(+), 4 deletions(-) - -diff --git a/CHANGELOG.md b/CHANGELOG.md -index 7f4d0f25e9..bb0eceb69a 100644 ---- a/CHANGELOG.md -+++ b/CHANGELOG.md -@@ -4,6 +4,12 @@ Notable changes to Xen will be documented in this file. - - The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - -+## [4.17.3](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.3) -+ -+### Changed -+ - Ignore VCPUOP_set_singleshot_timer's VCPU_SSHOTTMR_future flag. The only -+ known user doesn't use it properly, leading to in-guest breakage. 
-+ - ## [4.17.0](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.17.0) - 2022-12-12 - - ### Changed -diff --git a/xen/common/domain.c b/xen/common/domain.c -index 53f7e734fe..30c2279673 100644 ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -1691,9 +1691,16 @@ long common_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg) - if ( copy_from_guest(&set, arg, 1) ) - return -EFAULT; - -- if ( (set.flags & VCPU_SSHOTTMR_future) && -- (set.timeout_abs_ns < NOW()) ) -- return -ETIME; -+ if ( set.timeout_abs_ns < NOW() ) -+ { -+ /* -+ * Simplify the logic if the timeout has already expired and just -+ * inject the event. -+ */ -+ stop_timer(&v->singleshot_timer); -+ send_timer_event(v); -+ break; -+ } - - migrate_timer(&v->singleshot_timer, smp_processor_id()); - set_timer(&v->singleshot_timer, set.timeout_abs_ns); -diff --git a/xen/include/public/vcpu.h b/xen/include/public/vcpu.h -index 81a3b3a743..a836b264a9 100644 ---- a/xen/include/public/vcpu.h -+++ b/xen/include/public/vcpu.h -@@ -150,7 +150,10 @@ typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t; - DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t); - - /* Flags to VCPUOP_set_singleshot_timer. */ -- /* Require the timeout to be in the future (return -ETIME if it's passed). */ -+ /* -+ * Request the timeout to be in the future (return -ETIME if it's passed) -+ * but can be ignored by the hypervisor. 
-+ */ - #define _VCPU_SSHOTTMR_future (0) - #define VCPU_SSHOTTMR_future (1U << _VCPU_SSHOTTMR_future) - --- -2.42.0 - diff --git a/0016-x86-head-check-base-address-alignment.patch b/0016-x86-head-check-base-address-alignment.patch deleted file mode 100644 index 2b9cead..0000000 --- a/0016-x86-head-check-base-address-alignment.patch +++ /dev/null @@ -1,85 +0,0 @@ -From e5f9987d5f63ecc3cc9884c614aca699a41e7ca7 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Sep 2023 08:46:28 +0200 -Subject: [PATCH 16/55] x86/head: check base address alignment -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Ensure that the base address is 2M aligned, or else the page table -entries created would be corrupt as reserved bits on the PDE end up -set. - -We have encountered a broken firmware where grub2 would end up loading -Xen at a non 2M aligned region when using the multiboot2 protocol, and -that caused a very difficult to debug triple fault. - -If the alignment is not as required by the page tables print an error -message and stop the boot. Also add a build time check that the -calculation of symbol offsets don't break alignment of passed -addresses. - -The check could be performed earlier, but so far the alignment is -required by the page tables, and hence feels more natural that the -check lives near to the piece of code that requires it. - -Note that when booted as an EFI application from the PE entry point -the alignment check is already performed by -efi_arch_load_addr_check(), and hence there's no need to add another -check at the point where page tables get built in -efi_arch_memory_setup(). 
- -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 0946068e7faea22868c577d7afa54ba4970ff520 -master date: 2023-05-03 13:36:25 +0200 ---- - xen/arch/x86/boot/head.S | 14 ++++++++++++++ - 1 file changed, 14 insertions(+) - -diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S -index 245c859dd7..6bc64c9e86 100644 ---- a/xen/arch/x86/boot/head.S -+++ b/xen/arch/x86/boot/head.S -@@ -1,3 +1,4 @@ -+#include <xen/lib.h> - #include <xen/multiboot.h> - #include <xen/multiboot2.h> - #include <public/xen.h> -@@ -121,6 +122,7 @@ multiboot2_header: - .Lbad_ldr_nst: .asciz "ERR: EFI SystemTable is not provided by bootloader!" - .Lbad_ldr_nih: .asciz "ERR: EFI ImageHandle is not provided by bootloader!" - .Lbad_efi_msg: .asciz "ERR: EFI IA-32 platforms are not supported!" -+.Lbad_alg_msg: .asciz "ERR: Xen must be loaded at a 2Mb boundary!" - - .section .init.data, "aw", @progbits - .align 4 -@@ -146,6 +148,9 @@ bad_cpu: - not_multiboot: - mov $sym_offs(.Lbad_ldr_msg), %ecx - jmp .Lget_vtb -+.Lnot_aligned: -+ mov $sym_offs(.Lbad_alg_msg), %ecx -+ jmp .Lget_vtb - .Lmb2_no_st: - /* - * Here we are on EFI platform. vga_text_buffer was zapped earlier -@@ -673,6 +678,15 @@ trampoline_setup: - cmp %edi, %eax - jb 1b - -+ .if !IS_ALIGNED(sym_offs(0), 1 << L2_PAGETABLE_SHIFT) -+ .error "Symbol offset calculation breaks alignment" -+ .endif -+ -+ /* Check that the image base is aligned. */ -+ lea sym_esi(_start), %eax -+ test $(1 << L2_PAGETABLE_SHIFT) - 1, %eax -+ jnz .Lnot_aligned -+ - /* Map Xen into the higher mappings using 2M superpages. 
*/ - lea _PAGE_PSE + PAGE_HYPERVISOR_RWX + sym_esi(_start), %eax - mov $sym_offs(_start), %ecx /* %eax = PTE to write ^ */ --- -2.42.0 - diff --git a/0017-xenalyze-Handle-start-of-day-RUNNING-transitions.patch b/0017-xenalyze-Handle-start-of-day-RUNNING-transitions.patch deleted file mode 100644 index a4501a3..0000000 --- a/0017-xenalyze-Handle-start-of-day-RUNNING-transitions.patch +++ /dev/null @@ -1,275 +0,0 @@ -From f04295dd802fb6cd43a02ec59a5964b2c5950fe1 Mon Sep 17 00:00:00 2001 -From: George Dunlap <george.dunlap@cloud.com> -Date: Tue, 5 Sep 2023 08:47:14 +0200 -Subject: [PATCH 17/55] xenalyze: Handle start-of-day ->RUNNING transitions - -A recent xentrace highlighted an unhandled corner case in the vcpu -"start-of-day" logic, if the trace starts after the last running -> -non-running transition, but before the first non-running -> running -transition. Because start-of-day wasn't handled, vcpu_next_update() -was expecting p->current to be NULL, and tripping out with the -following error message when it wasn't: - -vcpu_next_update: FATAL: p->current not NULL! (d32768dv$p, runstate RUNSTATE_INIT) - -where 32768 is the DEFAULT_DOMAIN, and $p is the pcpu number. - -Instead of calling vcpu_start() piecemeal throughout -sched_runstate_process(), call it at the top of the function if the -vcpu in question is still in RUNSTATE_INIT, so that we can handle all -the cases in one place. - -Sketch out at the top of the function all cases which we need to -handle, and what to do in those cases. Some transitions tell us where -v is running; some transitions tell us about what is (or is not) -running on p; some transitions tell us neither. - -If a transition tells us where v is now running, update its state; -otherwise leave it in INIT, in order to avoid having to deal with TSC -skew on start-up. - -If a transition tells us what is or is not running on p, update -p->current (either to v or NULL). Otherwise leave it alone. - -If neither, do nothing. 
- -Reifying those rules: - -- If we're continuing to run, set v to RUNNING, and use p->first_tsc - as the runstate time. - -- If we're starting to run, set v to RUNNING, and use ri->tsc as the - runstate time. - -- If v is being deschedled, leave v in the INIT state to avoid dealing - with TSC skew; but set p->current to NULL so that whatever is - scheduled next won't trigger the assert in vcpu_next_update(). - -- If a vcpu is waking up (switching from one non-runnable state to - another non-runnable state), leave v in INIT, and p in whatever - state it's in (which may be the default domain, or some other vcpu - which has already run). - -While here, fix the comment above vcpu_start; it's called when the -vcpu state is INIT, not when current is the default domain. - -Signed-off-by: George Dunlap <george.dunlap@cloud.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: aab4b38b5d77e3c65f44bacd56427a85b7392a11 -master date: 2023-06-30 11:25:33 +0100 ---- - tools/xentrace/xenalyze.c | 159 ++++++++++++++++++++++++-------------- - 1 file changed, 101 insertions(+), 58 deletions(-) - -diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c -index e7ec284eea..9b4b62c82f 100644 ---- a/tools/xentrace/xenalyze.c -+++ b/tools/xentrace/xenalyze.c -@@ -6885,39 +6885,86 @@ void vcpu_next_update(struct pcpu_info *p, struct vcpu_data *next, tsc_t tsc) - p->lost_record.seen_valid_schedule = 1; - } - --/* If current is the default domain, we're fixing up from something -- * like start-of-day. Update what we can. */ --void vcpu_start(struct pcpu_info *p, struct vcpu_data *v) { -- /* If vcpus are created, or first show up, in a "dead zone", this will -- * fail. 
*/ -- if( !p->current || p->current->d->did != DEFAULT_DOMAIN) { -- fprintf(stderr, "Strange, p->current not default domain!\n"); -- error(ERR_FILE, NULL); -- return; -- } -+/* -+ * If the vcpu in question is in state INIT, we're fixing up from something -+ * like start-of-day. Update what we can. -+ */ -+void vcpu_start(struct pcpu_info *p, struct vcpu_data *v, -+ int old_runstate, int new_runstate, tsc_t ri_tsc) { -+ tsc_t tsc; -+ -+ /* -+ * -+ * Cases: -+ * running -> running: -+ * v -> running, using p->first_tsc -+ * {runnable, blocked} -> running: -+ * v -> running, using ri->tsc -+ * running -> {runnable, blocked}: -+ * Leave v INIT, but clear p->current in case another vcpu is scheduled -+ * blocked -> runnable: -+ * Leave INIT, and also leave p->current, since we still don't know who's scheduled here -+ */ -+ -+ /* -+ * NB that a vcpu won't come out of INIT until it starts running somewhere. -+ * If this event is pcpu that has already seen a scheduling event, p->current -+ * should be null; if this is the first scheduling event on this pcpu, -+ * p->current should be the default domain. 
-+ */ -+ if( old_runstate == RUNSTATE_RUNNING ) { -+ if ( !p->current || p->current->d->did != DEFAULT_DOMAIN) { -+ fprintf(stderr, "Strange, p->current not default domain!\n"); -+ error(ERR_FILE, NULL); -+ return; - -- if(!p->first_tsc) { -- fprintf(stderr, "Strange, p%d first_tsc 0!\n", p->pid); -- error(ERR_FILE, NULL); -+ } -+ -+ if(!p->first_tsc) { -+ fprintf(stderr, "Strange, p%d first_tsc 0!\n", p->pid); -+ error(ERR_FILE, NULL); -+ } -+ -+ if(p->first_tsc <= p->current->runstate.tsc) { -+ fprintf(stderr, "Strange, first_tsc %llx < default_domain runstate tsc %llx!\n", -+ p->first_tsc, -+ p->current->runstate.tsc); -+ error(ERR_FILE, NULL); -+ } -+ -+ /* Change default domain to 'queued' */ -+ runstate_update(p->current, RUNSTATE_QUEUED, p->first_tsc); -+ -+ /* -+ * Set current to NULL, so that if another vcpu (not in INIT) -+ * is scheduled here, we don't trip over the check in -+ * vcpu_next_update() -+ */ -+ p->current = NULL; - } - -- if(p->first_tsc <= p->current->runstate.tsc) { -- fprintf(stderr, "Strange, first_tsc %llx < default_domain runstate tsc %llx!\n", -- p->first_tsc, -- p->current->runstate.tsc); -- error(ERR_FILE, NULL); -+ /* TSC skew at start-of-day is hard to deal with. Don't -+ * bring a vcpu out of INIT until it's seen to be actually -+ * running somewhere. 
*/ -+ if ( new_runstate != RUNSTATE_RUNNING ) { -+ fprintf(warn, "First schedule for d%dv%d doesn't take us into a running state; leaving INIT\n", -+ v->d->did, v->vid); -+ -+ return; - } - -- /* Change default domain to 'queued' */ -- runstate_update(p->current, RUNSTATE_QUEUED, p->first_tsc); -+ tsc = ri_tsc; -+ if ( old_runstate == RUNSTATE_RUNNING ) { -+ /* FIXME: Copy over data from the default domain this interval */ -+ fprintf(warn, "Using first_tsc for d%dv%d (%lld cycles)\n", -+ v->d->did, v->vid, p->last_tsc - p->first_tsc); - -- /* FIXME: Copy over data from the default domain this interval */ -- fprintf(warn, "Using first_tsc for d%dv%d (%lld cycles)\n", -- v->d->did, v->vid, p->last_tsc - p->first_tsc); -+ tsc = p->first_tsc; -+ } - - /* Simulate the time since the first tsc */ -- runstate_update(v, RUNSTATE_RUNNING, p->first_tsc); -- p->time.tsc = p->first_tsc; -+ runstate_update(v, RUNSTATE_RUNNING, tsc); -+ p->time.tsc = tsc; - p->current = v; - pcpu_string_draw(p); - v->p = p; -@@ -7021,6 +7068,13 @@ void sched_runstate_process(struct pcpu_info *p) - last_oldstate = v->runstate.last_oldstate; - v->runstate.last_oldstate.wrong = RUNSTATE_INIT; - -+ /* Handle all "start-of-day" issues in one place. This can be -+ * done before any of the other tracks or sanity checks. */ -+ if ( v->runstate.state == RUNSTATE_INIT ) { -+ vcpu_start(p, v, sevt.old_runstate, sevt.new_runstate, ri->tsc); -+ return; -+ } -+ - /* Close vmexits when the putative reason for blocking / &c stops. - * This way, we don't account cpu contention to some other overhead. */ - if(sevt.new_runstate == RUNSTATE_RUNNABLE -@@ -7190,32 +7244,27 @@ update: - * or stopping actually running on a physical cpu. */ - if ( type == CONTINUE ) - { -- if( v->runstate.state == RUNSTATE_INIT ) { -- /* Start-of-day; account first tsc -> now to v */ -- vcpu_start(p, v); -- } else { -- /* Continue running. 
First, do some sanity checks */ -- if ( v->runstate.state == RUNSTATE_LOST ) { -- fprintf(warn, "WARNING: continue with d%dv%d in RUNSTATE_LOST. Resetting current.\n", -- v->d->did, v->vid); -- if ( p->current ) -- vcpu_prev_update(p, p->current, ri->tsc, RUNSTATE_LOST); -- vcpu_next_update(p, v, ri->tsc); -- } -- else if( v->runstate.state != RUNSTATE_RUNNING ) { -- /* This should never happen. */ -- fprintf(warn, "FATAL: sevt.old_runstate running, but d%dv%d runstate %s!\n", -- v->d->did, v->vid, runstate_name[v->runstate.state]); -- error(ERR_FILE, NULL); -- } else if ( v->p != p ) { -- fprintf(warn, "FATAL: continue on p%d, but d%dv%d p%d!\n", -- p->pid, v->d->did, v->vid, -- v->p ? v->p->pid : -1); -- error(ERR_FILE, NULL); -- } -- -- runstate_update(v, RUNSTATE_RUNNING, ri->tsc); -+ /* Continue running. First, do some sanity checks */ -+ if ( v->runstate.state == RUNSTATE_LOST ) { -+ fprintf(warn, "WARNING: continue with d%dv%d in RUNSTATE_LOST. Resetting current.\n", -+ v->d->did, v->vid); -+ if ( p->current ) -+ vcpu_prev_update(p, p->current, ri->tsc, RUNSTATE_LOST); -+ vcpu_next_update(p, v, ri->tsc); -+ } -+ else if( v->runstate.state != RUNSTATE_RUNNING ) { -+ /* This should never happen. */ -+ fprintf(warn, "FATAL: sevt.old_runstate running, but d%dv%d runstate %s!\n", -+ v->d->did, v->vid, runstate_name[v->runstate.state]); -+ error(ERR_FILE, NULL); -+ } else if ( v->p != p ) { -+ fprintf(warn, "FATAL: continue on p%d, but d%dv%d p%d!\n", -+ p->pid, v->d->did, v->vid, -+ v->p ? 
v->p->pid : -1); -+ error(ERR_FILE, NULL); - } -+ -+ runstate_update(v, RUNSTATE_RUNNING, ri->tsc); - } - else if ( sevt.old_runstate == RUNSTATE_RUNNING - || v->runstate.state == RUNSTATE_RUNNING ) -@@ -7232,10 +7281,7 @@ update: - * # (should never happen) - */ - if( sevt.old_runstate == RUNSTATE_RUNNING ) { -- if( v->runstate.state == RUNSTATE_INIT ) { -- /* Start-of-day; account first tsc -> now to v */ -- vcpu_start(p, v); -- } else if( v->runstate.state != RUNSTATE_RUNNING -+ if( v->runstate.state != RUNSTATE_RUNNING - && v->runstate.state != RUNSTATE_LOST ) { - /* This should never happen. */ - fprintf(warn, "FATAL: sevt.old_runstate running, but d%dv%d runstate %s!\n", -@@ -7264,11 +7310,8 @@ update: - - vcpu_next_update(p, v, ri->tsc); - } -- else if ( v->runstate.state != RUNSTATE_INIT ) -+ else - { -- /* TSC skew at start-of-day is hard to deal with. Don't -- * bring a vcpu out of INIT until it's seen to be actually -- * running somewhere. */ - runstate_update(v, sevt.new_runstate, ri->tsc); - } - --- -2.42.0 - diff --git a/0018-x86-ioapic-sanitize-IO-APIC-pins-before-enabling-lap.patch b/0018-x86-ioapic-sanitize-IO-APIC-pins-before-enabling-lap.patch deleted file mode 100644 index a03f86e..0000000 --- a/0018-x86-ioapic-sanitize-IO-APIC-pins-before-enabling-lap.patch +++ /dev/null @@ -1,113 +0,0 @@ -From d0cdd34dd815bf99c3f8a7bddfdde5ae59b0f0db Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Sep 2023 08:47:34 +0200 -Subject: [PATCH 18/55] x86/ioapic: sanitize IO-APIC pins before enabling lapic - LVTERR/ESR -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current logic to init the local APIC and the IO-APIC does init the -local APIC LVTERR/ESR before doing any sanitization on the IO-APIC pin -configuration. It's already noted on enable_IO_APIC() that Xen -shouldn't trust the IO-APIC being empty at bootup. 
- -At XenServer we have a system where the IO-APIC 0 is handed to Xen -with pin 0 unmasked, set to Fixed delivery mode, edge triggered and -with a vector of 0 (all fields of the RTE are zeroed). Once the local -APIC LVTERR/ESR is enabled periodic injections from such pin cause the -local APIC to in turn inject periodic error vectors: - -APIC error on CPU0: 00(40), Received illegal vector -APIC error on CPU0: 40(40), Received illegal vector -APIC error on CPU0: 40(40), Received illegal vector -APIC error on CPU0: 40(40), Received illegal vector -APIC error on CPU0: 40(40), Received illegal vector -APIC error on CPU0: 40(40), Received illegal vector - -That prevents Xen from booting. - -Move the masking of the IO-APIC pins ahead of the setup of the local -APIC. This has the side effect of also moving the detection of the -pin where the i8259 is connected, as such detection must be done -before masking any pins. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 813da5f0e73b8cbd2ac3c7922506e58c28cd736d -master date: 2023-07-17 10:31:10 +0200 ---- - xen/arch/x86/apic.c | 4 ++++ - xen/arch/x86/include/asm/irq.h | 1 + - xen/arch/x86/io_apic.c | 4 +--- - xen/arch/x86/smpboot.c | 5 +++++ - 4 files changed, 11 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c -index 47e6e5fe41..33103d3e91 100644 ---- a/xen/arch/x86/apic.c -+++ b/xen/arch/x86/apic.c -@@ -1491,6 +1491,10 @@ int __init APIC_init_uniprocessor (void) - physids_clear(phys_cpu_present_map); - physid_set(boot_cpu_physical_apicid, phys_cpu_present_map); - -+ if ( !skip_ioapic_setup && nr_ioapics ) -+ /* Sanitize the IO-APIC pins before enabling the lapic LVTERR/ESR. 
*/ -+ enable_IO_APIC(); -+ - setup_local_APIC(true); - - if (nmi_watchdog == NMI_LOCAL_APIC) -diff --git a/xen/arch/x86/include/asm/irq.h b/xen/arch/x86/include/asm/irq.h -index 76e6ed6d60..f6a0207a80 100644 ---- a/xen/arch/x86/include/asm/irq.h -+++ b/xen/arch/x86/include/asm/irq.h -@@ -122,6 +122,7 @@ bool bogus_8259A_irq(unsigned int irq); - int i8259A_suspend(void); - int i8259A_resume(void); - -+void enable_IO_APIC(void); - void setup_IO_APIC(void); - void disable_IO_APIC(void); - void setup_ioapic_dest(void); -diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c -index 9b8a972cf5..25a08b1ea6 100644 ---- a/xen/arch/x86/io_apic.c -+++ b/xen/arch/x86/io_apic.c -@@ -1273,7 +1273,7 @@ static void cf_check _print_IO_APIC_keyhandler(unsigned char key) - __print_IO_APIC(0); - } - --static void __init enable_IO_APIC(void) -+void __init enable_IO_APIC(void) - { - int i8259_apic, i8259_pin; - int i, apic; -@@ -2067,8 +2067,6 @@ static void __init ioapic_pm_state_alloc(void) - - void __init setup_IO_APIC(void) - { -- enable_IO_APIC(); -- - if (acpi_ioapic) - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ - else -diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c -index b46fd9ab18..41ec3211ac 100644 ---- a/xen/arch/x86/smpboot.c -+++ b/xen/arch/x86/smpboot.c -@@ -1232,6 +1232,11 @@ void __init smp_prepare_cpus(void) - verify_local_APIC(); - - connect_bsp_APIC(); -+ -+ if ( !skip_ioapic_setup && nr_ioapics ) -+ /* Sanitize the IO-APIC pins before enabling the lapic LVTERR/ESR. 
*/ -+ enable_IO_APIC(); -+ - setup_local_APIC(true); - - if ( !skip_ioapic_setup && nr_ioapics ) --- -2.42.0 - diff --git a/0019-x86-ioapic-add-a-raw-field-to-RTE-struct.patch b/0019-x86-ioapic-add-a-raw-field-to-RTE-struct.patch deleted file mode 100644 index 10e5946..0000000 --- a/0019-x86-ioapic-add-a-raw-field-to-RTE-struct.patch +++ /dev/null @@ -1,147 +0,0 @@ -From a885649098e06432939907eee84f735a644883e6 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Sep 2023 08:48:43 +0200 -Subject: [PATCH 19/55] x86/ioapic: add a raw field to RTE struct -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Further changes will require access to the full RTE as a single value -in order to pass it to IOMMU interrupt remapping handlers. - -No functional change intended. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: cdc48cb5a74b10c2b07a09d2f554756d730bfee3 -master date: 2023-07-28 09:39:44 +0200 ---- - xen/arch/x86/include/asm/io_apic.h | 57 +++++++++++++----------- - xen/arch/x86/io_apic.c | 2 +- - xen/drivers/passthrough/amd/iommu_intr.c | 4 +- - xen/drivers/passthrough/vtd/intremap.c | 4 +- - 4 files changed, 35 insertions(+), 32 deletions(-) - -diff --git a/xen/arch/x86/include/asm/io_apic.h b/xen/arch/x86/include/asm/io_apic.h -index ef0878b09e..a558bb063c 100644 ---- a/xen/arch/x86/include/asm/io_apic.h -+++ b/xen/arch/x86/include/asm/io_apic.h -@@ -89,35 +89,38 @@ enum ioapic_irq_destination_types { - }; - - struct IO_APIC_route_entry { -- unsigned int vector:8; -- unsigned int delivery_mode:3; /* -- * 000: FIXED -- * 001: lowest prio -- * 111: ExtINT -- */ -- unsigned int dest_mode:1; /* 0: physical, 1: logical */ -- unsigned int delivery_status:1; -- unsigned int polarity:1; /* 0: low, 1: high */ -- unsigned int irr:1; -- unsigned int trigger:1; /* 0: edge, 1: level */ -- unsigned int mask:1; /* 0: 
enabled, 1: disabled */ -- unsigned int __reserved_2:15; -- - union { - struct { -- unsigned int __reserved_1:24; -- unsigned int physical_dest:4; -- unsigned int __reserved_2:4; -- } physical; -- -- struct { -- unsigned int __reserved_1:24; -- unsigned int logical_dest:8; -- } logical; -- -- /* used when Interrupt Remapping with EIM is enabled */ -- unsigned int dest32; -- } dest; -+ unsigned int vector:8; -+ unsigned int delivery_mode:3; /* -+ * 000: FIXED -+ * 001: lowest prio -+ * 111: ExtINT -+ */ -+ unsigned int dest_mode:1; /* 0: physical, 1: logical */ -+ unsigned int delivery_status:1; -+ unsigned int polarity:1; /* 0: low, 1: high */ -+ unsigned int irr:1; -+ unsigned int trigger:1; /* 0: edge, 1: level */ -+ unsigned int mask:1; /* 0: enabled, 1: disabled */ -+ unsigned int __reserved_2:15; -+ -+ union { -+ struct { -+ unsigned int __reserved_1:24; -+ unsigned int physical_dest:4; -+ unsigned int __reserved_2:4; -+ } physical; -+ -+ struct { -+ unsigned int __reserved_1:24; -+ unsigned int logical_dest:8; -+ } logical; -+ unsigned int dest32; -+ } dest; -+ }; -+ uint64_t raw; -+ }; - }; - - /* -diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c -index 25a08b1ea6..aada2ef96c 100644 ---- a/xen/arch/x86/io_apic.c -+++ b/xen/arch/x86/io_apic.c -@@ -2360,7 +2360,7 @@ int ioapic_guest_read(unsigned long physbase, unsigned int reg, u32 *pval) - int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val) - { - int apic, pin, irq, ret, pirq; -- struct IO_APIC_route_entry rte = { 0 }; -+ struct IO_APIC_route_entry rte = { }; - unsigned long flags; - struct irq_desc *desc; - -diff --git a/xen/drivers/passthrough/amd/iommu_intr.c b/xen/drivers/passthrough/amd/iommu_intr.c -index f4de09f431..9e6be3be35 100644 ---- a/xen/drivers/passthrough/amd/iommu_intr.c -+++ b/xen/drivers/passthrough/amd/iommu_intr.c -@@ -352,8 +352,8 @@ static int update_intremap_entry_from_ioapic( - void cf_check amd_iommu_ioapic_update_ire( - unsigned int apic, unsigned 
int reg, unsigned int value) - { -- struct IO_APIC_route_entry old_rte = { 0 }; -- struct IO_APIC_route_entry new_rte = { 0 }; -+ struct IO_APIC_route_entry old_rte = { }; -+ struct IO_APIC_route_entry new_rte = { }; - unsigned int rte_lo = (reg & 1) ? reg - 1 : reg; - unsigned int pin = (reg - 0x10) / 2; - int seg, bdf, rc; -diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c -index 1512e4866b..019c21c556 100644 ---- a/xen/drivers/passthrough/vtd/intremap.c -+++ b/xen/drivers/passthrough/vtd/intremap.c -@@ -419,7 +419,7 @@ unsigned int cf_check io_apic_read_remap_rte( - { - unsigned int ioapic_pin = (reg - 0x10) / 2; - int index; -- struct IO_xAPIC_route_entry old_rte = { 0 }; -+ struct IO_xAPIC_route_entry old_rte = { }; - int rte_upper = (reg & 1) ? 1 : 0; - struct vtd_iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic)); - -@@ -442,7 +442,7 @@ void cf_check io_apic_write_remap_rte( - unsigned int apic, unsigned int reg, unsigned int value) - { - unsigned int ioapic_pin = (reg - 0x10) / 2; -- struct IO_xAPIC_route_entry old_rte = { 0 }; -+ struct IO_xAPIC_route_entry old_rte = { }; - struct IO_APIC_route_remap_entry *remap_rte; - unsigned int rte_upper = (reg & 1) ? 
1 : 0; - struct vtd_iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic)); --- -2.42.0 - diff --git a/0020-x86-ioapic-RTE-modifications-must-use-ioapic_write_e.patch b/0020-x86-ioapic-RTE-modifications-must-use-ioapic_write_e.patch deleted file mode 100644 index 43faeeb..0000000 --- a/0020-x86-ioapic-RTE-modifications-must-use-ioapic_write_e.patch +++ /dev/null @@ -1,180 +0,0 @@ -From 1bd4523d696d26976f64a919df8c7a1b3ea32f6f Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Sep 2023 08:49:37 +0200 -Subject: [PATCH 20/55] x86/ioapic: RTE modifications must use - ioapic_write_entry -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Do not allow to write to RTE registers using io_apic_write and instead -require changes to RTE to be performed using ioapic_write_entry. - -This is in preparation for passing the full contents of the RTE to the -IOMMU interrupt remapping handlers, so remapping entries for IO-APIC -RTEs can be updated atomically when possible. - -While immediately this commit might expand the number of MMIO accesses -in order to update an IO-APIC RTE, further changes will benefit from -getting the full RTE value passed to the IOMMU handlers, as the logic -is greatly simplified when the IOMMU handlers can get the complete RTE -value in one go. 
- -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: ef7995ed1bcd7eac37fb3c3fe56eaa54ea9baf6c -master date: 2023-07-28 09:40:20 +0200 ---- - xen/arch/x86/include/asm/io_apic.h | 8 ++--- - xen/arch/x86/io_apic.c | 43 ++++++++++++------------ - xen/drivers/passthrough/amd/iommu_intr.c | 6 ---- - 3 files changed, 25 insertions(+), 32 deletions(-) - -diff --git a/xen/arch/x86/include/asm/io_apic.h b/xen/arch/x86/include/asm/io_apic.h -index a558bb063c..6b514b4e3d 100644 ---- a/xen/arch/x86/include/asm/io_apic.h -+++ b/xen/arch/x86/include/asm/io_apic.h -@@ -161,8 +161,8 @@ static inline void __io_apic_write(unsigned int apic, unsigned int reg, unsigned - - static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) - { -- if ( ioapic_reg_remapped(reg) ) -- return iommu_update_ire_from_apic(apic, reg, value); -+ /* RTE writes must use ioapic_write_entry. */ -+ BUG_ON(reg >= 0x10); - __io_apic_write(apic, reg, value); - } - -@@ -172,8 +172,8 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i - */ - static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) - { -- if ( ioapic_reg_remapped(reg) ) -- return iommu_update_ire_from_apic(apic, reg, value); -+ /* RTE writes must use ioapic_write_entry. 
*/ -+ BUG_ON(reg >= 0x10); - *(IO_APIC_BASE(apic) + 4) = value; - } - -diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c -index aada2ef96c..041233b9b7 100644 ---- a/xen/arch/x86/io_apic.c -+++ b/xen/arch/x86/io_apic.c -@@ -237,15 +237,15 @@ struct IO_APIC_route_entry __ioapic_read_entry( - { - union entry_union eu; - -- if ( raw ) -+ if ( raw || !iommu_intremap ) - { - eu.w1 = __io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = __io_apic_read(apic, 0x11 + 2 * pin); - } - else - { -- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); -- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); -+ eu.w1 = iommu_read_apic_from_ire(apic, 0x10 + 2 * pin); -+ eu.w2 = iommu_read_apic_from_ire(apic, 0x11 + 2 * pin); - } - - return eu.entry; -@@ -269,15 +269,15 @@ void __ioapic_write_entry( - { - union entry_union eu = { .entry = e }; - -- if ( raw ) -+ if ( raw || !iommu_intremap ) - { - __io_apic_write(apic, 0x11 + 2 * pin, eu.w2); - __io_apic_write(apic, 0x10 + 2 * pin, eu.w1); - } - else - { -- io_apic_write(apic, 0x11 + 2 * pin, eu.w2); -- io_apic_write(apic, 0x10 + 2 * pin, eu.w1); -+ iommu_update_ire_from_apic(apic, 0x11 + 2 * pin, eu.w2); -+ iommu_update_ire_from_apic(apic, 0x10 + 2 * pin, eu.w1); - } - } - -@@ -433,16 +433,17 @@ static void modify_IO_APIC_irq(unsigned int irq, unsigned int enable, - unsigned int disable) - { - struct irq_pin_list *entry = irq_2_pin + irq; -- unsigned int pin, reg; - - for (;;) { -- pin = entry->pin; -+ unsigned int pin = entry->pin; -+ struct IO_APIC_route_entry rte; -+ - if (pin == -1) - break; -- reg = io_apic_read(entry->apic, 0x10 + pin*2); -- reg &= ~disable; -- reg |= enable; -- io_apic_modify(entry->apic, 0x10 + pin*2, reg); -+ rte = __ioapic_read_entry(entry->apic, pin, false); -+ rte.raw &= ~(uint64_t)disable; -+ rte.raw |= enable; -+ __ioapic_write_entry(entry->apic, pin, false, rte); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; -@@ -584,16 +585,16 @@ set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t 
*mask) - dest = SET_APIC_LOGICAL_ID(dest); - entry = irq_2_pin + irq; - for (;;) { -- unsigned int data; -+ struct IO_APIC_route_entry rte; -+ - pin = entry->pin; - if (pin == -1) - break; - -- io_apic_write(entry->apic, 0x10 + 1 + pin*2, dest); -- data = io_apic_read(entry->apic, 0x10 + pin*2); -- data &= ~IO_APIC_REDIR_VECTOR_MASK; -- data |= MASK_INSR(desc->arch.vector, IO_APIC_REDIR_VECTOR_MASK); -- io_apic_modify(entry->apic, 0x10 + pin*2, data); -+ rte = __ioapic_read_entry(entry->apic, pin, false); -+ rte.dest.dest32 = dest; -+ rte.vector = desc->arch.vector; -+ __ioapic_write_entry(entry->apic, pin, false, rte); - - if (!entry->next) - break; -@@ -2127,10 +2128,8 @@ void ioapic_resume(void) - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; - __io_apic_write(apic, 0, reg_00.raw); - } -- for (i = 0; i < nr_ioapic_entries[apic]; i++, entry++) { -- __io_apic_write(apic, 0x11+2*i, *(((int *)entry)+1)); -- __io_apic_write(apic, 0x10+2*i, *(((int *)entry)+0)); -- } -+ for (i = 0; i < nr_ioapic_entries[apic]; i++, entry++) -+ __ioapic_write_entry(apic, i, true, *entry); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - } -diff --git a/xen/drivers/passthrough/amd/iommu_intr.c b/xen/drivers/passthrough/amd/iommu_intr.c -index 9e6be3be35..f32c418a7e 100644 ---- a/xen/drivers/passthrough/amd/iommu_intr.c -+++ b/xen/drivers/passthrough/amd/iommu_intr.c -@@ -361,12 +361,6 @@ void cf_check amd_iommu_ioapic_update_ire( - struct amd_iommu *iommu; - unsigned int idx; - -- if ( !iommu_intremap ) -- { -- __io_apic_write(apic, reg, value); -- return; -- } -- - idx = ioapic_id_to_index(IO_APIC_ID(apic)); - if ( idx == MAX_IO_APICS ) - return; --- -2.42.0 - diff --git a/0021-iommu-vtd-rename-io_apic_read_remap_rte-local-variab.patch b/0021-iommu-vtd-rename-io_apic_read_remap_rte-local-variab.patch deleted file mode 100644 index 6560452..0000000 --- a/0021-iommu-vtd-rename-io_apic_read_remap_rte-local-variab.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 
e08e7330c58b7ee1efb00e348521a6afc524dc38 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Sep 2023 08:50:05 +0200 -Subject: [PATCH 21/55] iommu/vtd: rename io_apic_read_remap_rte() local - variable -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Preparatory change to unify the IO-APIC pin variable name between -io_apic_read_remap_rte() and amd_iommu_ioapic_update_ire(), so that -the local variable can be made a function parameter with the same name -across vendors. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Kevin Tian <kevin.tian@intel.com> -master commit: a478b38c01b65fa030303f0324a3380d872eb165 -master date: 2023-07-28 09:40:42 +0200 ---- - xen/drivers/passthrough/vtd/intremap.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c -index 019c21c556..53c9de9a75 100644 ---- a/xen/drivers/passthrough/vtd/intremap.c -+++ b/xen/drivers/passthrough/vtd/intremap.c -@@ -441,14 +441,14 @@ unsigned int cf_check io_apic_read_remap_rte( - void cf_check io_apic_write_remap_rte( - unsigned int apic, unsigned int reg, unsigned int value) - { -- unsigned int ioapic_pin = (reg - 0x10) / 2; -+ unsigned int pin = (reg - 0x10) / 2; - struct IO_xAPIC_route_entry old_rte = { }; - struct IO_APIC_route_remap_entry *remap_rte; - unsigned int rte_upper = (reg & 1) ? 
1 : 0; - struct vtd_iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic)); - int saved_mask; - -- old_rte = __ioapic_read_entry(apic, ioapic_pin, true); -+ old_rte = __ioapic_read_entry(apic, pin, true); - - remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte; - -@@ -458,7 +458,7 @@ void cf_check io_apic_write_remap_rte( - __io_apic_write(apic, reg & ~1, *(u32 *)&old_rte); - remap_rte->mask = saved_mask; - -- if ( ioapic_rte_to_remap_entry(iommu, apic, ioapic_pin, -+ if ( ioapic_rte_to_remap_entry(iommu, apic, pin, - &old_rte, rte_upper, value) ) - { - __io_apic_write(apic, reg, value); -@@ -468,7 +468,7 @@ void cf_check io_apic_write_remap_rte( - __io_apic_write(apic, reg & ~1, *(u32 *)&old_rte); - } - else -- __ioapic_write_entry(apic, ioapic_pin, true, old_rte); -+ __ioapic_write_entry(apic, pin, true, old_rte); - } - - static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire) --- -2.42.0 - diff --git a/0022-x86-iommu-pass-full-IO-APIC-RTE-for-remapping-table-.patch b/0022-x86-iommu-pass-full-IO-APIC-RTE-for-remapping-table-.patch deleted file mode 100644 index e06714e..0000000 --- a/0022-x86-iommu-pass-full-IO-APIC-RTE-for-remapping-table-.patch +++ /dev/null @@ -1,462 +0,0 @@ -From 5116fe12d8238cc7d6582ceefd3f7e944bff9a1d Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Sep 2023 08:50:39 +0200 -Subject: [PATCH 22/55] x86/iommu: pass full IO-APIC RTE for remapping table - update -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -So that the remapping entry can be updated atomically when possible. - -Doing such update atomically will avoid Xen having to mask the IO-APIC -pin prior to performing any interrupt movements (ie: changing the -destination and vector fields), as the interrupt remapping entry is -always consistent. 
- -This also simplifies some of the logic on both VT-d and AMD-Vi -implementations, as having the full RTE available instead of half of -it avoids to possibly read and update the missing other half from -hardware. - -While there remove the explicit zeroing of new_ire fields in -ioapic_rte_to_remap_entry() and initialize the variable at definition -so all fields are zeroed. Note fields could be also initialized with -final values at definition, but I found that likely too much to be -done at this time. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Kevin Tian <kevin.tian@intel.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 3e033172b0250446bfe119f31c7f0f51684b0472 -master date: 2023-08-01 11:48:39 +0200 ---- - xen/arch/x86/include/asm/iommu.h | 3 +- - xen/arch/x86/io_apic.c | 5 +- - xen/drivers/passthrough/amd/iommu.h | 2 +- - xen/drivers/passthrough/amd/iommu_intr.c | 100 ++--------------- - xen/drivers/passthrough/vtd/extern.h | 2 +- - xen/drivers/passthrough/vtd/intremap.c | 131 +++++++++++------------ - xen/drivers/passthrough/x86/iommu.c | 4 +- - xen/include/xen/iommu.h | 3 +- - 8 files changed, 82 insertions(+), 168 deletions(-) - -diff --git a/xen/arch/x86/include/asm/iommu.h b/xen/arch/x86/include/asm/iommu.h -index fc0afe35bf..c0d4ad3742 100644 ---- a/xen/arch/x86/include/asm/iommu.h -+++ b/xen/arch/x86/include/asm/iommu.h -@@ -97,7 +97,8 @@ struct iommu_init_ops { - - extern const struct iommu_init_ops *iommu_init_ops; - --void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value); -+void iommu_update_ire_from_apic(unsigned int apic, unsigned int pin, -+ uint64_t rte); - unsigned int iommu_read_apic_from_ire(unsigned int apic, unsigned int reg); - int iommu_setup_hpet_msi(struct msi_desc *); - -diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c -index 041233b9b7..b3afef8933 100644 ---- a/xen/arch/x86/io_apic.c -+++ b/xen/arch/x86/io_apic.c -@@ -275,10 +275,7 @@ void 
__ioapic_write_entry( - __io_apic_write(apic, 0x10 + 2 * pin, eu.w1); - } - else -- { -- iommu_update_ire_from_apic(apic, 0x11 + 2 * pin, eu.w2); -- iommu_update_ire_from_apic(apic, 0x10 + 2 * pin, eu.w1); -- } -+ iommu_update_ire_from_apic(apic, pin, e.raw); - } - - static void ioapic_write_entry( -diff --git a/xen/drivers/passthrough/amd/iommu.h b/xen/drivers/passthrough/amd/iommu.h -index 8bc3c35b1b..5429ada58e 100644 ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -300,7 +300,7 @@ int cf_check amd_iommu_free_intremap_table( - unsigned int amd_iommu_intremap_table_order( - const void *irt, const struct amd_iommu *iommu); - void cf_check amd_iommu_ioapic_update_ire( -- unsigned int apic, unsigned int reg, unsigned int value); -+ unsigned int apic, unsigned int pin, uint64_t rte); - unsigned int cf_check amd_iommu_read_ioapic_from_ire( - unsigned int apic, unsigned int reg); - int cf_check amd_iommu_msi_msg_update_ire( -diff --git a/xen/drivers/passthrough/amd/iommu_intr.c b/xen/drivers/passthrough/amd/iommu_intr.c -index f32c418a7e..e83a2a932a 100644 ---- a/xen/drivers/passthrough/amd/iommu_intr.c -+++ b/xen/drivers/passthrough/amd/iommu_intr.c -@@ -247,11 +247,6 @@ static void update_intremap_entry(const struct amd_iommu *iommu, - } - } - --static inline int get_rte_index(const struct IO_APIC_route_entry *rte) --{ -- return rte->vector | (rte->delivery_mode << 8); --} -- - static inline void set_rte_index(struct IO_APIC_route_entry *rte, int offset) - { - rte->vector = (u8)offset; -@@ -267,7 +262,6 @@ static int update_intremap_entry_from_ioapic( - int bdf, - struct amd_iommu *iommu, - struct IO_APIC_route_entry *rte, -- bool_t lo_update, - u16 *index) - { - unsigned long flags; -@@ -315,31 +309,6 @@ static int update_intremap_entry_from_ioapic( - spin_lock(lock); - } - -- if ( fresh ) -- /* nothing */; -- else if ( !lo_update ) -- { -- /* -- * Low half of incoming RTE is already in remapped format, -- * so need to 
recover vector and delivery mode from IRTE. -- */ -- ASSERT(get_rte_index(rte) == offset); -- if ( iommu->ctrl.ga_en ) -- vector = entry.ptr128->full.vector; -- else -- vector = entry.ptr32->flds.vector; -- /* The IntType fields match for both formats. */ -- delivery_mode = entry.ptr32->flds.int_type; -- } -- else if ( x2apic_enabled ) -- { -- /* -- * High half of incoming RTE was read from the I/O APIC and hence may -- * not hold the full destination, so need to recover full destination -- * from IRTE. -- */ -- dest = get_full_dest(entry.ptr128); -- } - update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest); - - spin_unlock_irqrestore(lock, flags); -@@ -350,14 +319,11 @@ static int update_intremap_entry_from_ioapic( - } - - void cf_check amd_iommu_ioapic_update_ire( -- unsigned int apic, unsigned int reg, unsigned int value) -+ unsigned int apic, unsigned int pin, uint64_t rte) - { -- struct IO_APIC_route_entry old_rte = { }; -- struct IO_APIC_route_entry new_rte = { }; -- unsigned int rte_lo = (reg & 1) ? 
reg - 1 : reg; -- unsigned int pin = (reg - 0x10) / 2; -+ struct IO_APIC_route_entry old_rte; -+ struct IO_APIC_route_entry new_rte = { .raw = rte }; - int seg, bdf, rc; -- bool saved_mask, fresh = false; - struct amd_iommu *iommu; - unsigned int idx; - -@@ -373,58 +339,23 @@ void cf_check amd_iommu_ioapic_update_ire( - { - AMD_IOMMU_WARN("failed to find IOMMU for IO-APIC @ %04x:%04x\n", - seg, bdf); -- __io_apic_write(apic, reg, value); -+ __ioapic_write_entry(apic, pin, true, new_rte); - return; - } - -- /* save io-apic rte lower 32 bits */ -- *((u32 *)&old_rte) = __io_apic_read(apic, rte_lo); -- saved_mask = old_rte.mask; -- -- if ( reg == rte_lo ) -- { -- *((u32 *)&new_rte) = value; -- /* read upper 32 bits from io-apic rte */ -- *(((u32 *)&new_rte) + 1) = __io_apic_read(apic, reg + 1); -- } -- else -- { -- *((u32 *)&new_rte) = *((u32 *)&old_rte); -- *(((u32 *)&new_rte) + 1) = value; -- } -- -- if ( ioapic_sbdf[idx].pin_2_idx[pin] >= INTREMAP_MAX_ENTRIES ) -- { -- ASSERT(saved_mask); -- -- /* -- * There's nowhere except the IRTE to store a full 32-bit destination, -- * so we may not bypass entry allocation and updating of the low RTE -- * half in the (usual) case of the high RTE half getting written first. -- */ -- if ( new_rte.mask && !x2apic_enabled ) -- { -- __io_apic_write(apic, reg, value); -- return; -- } -- -- fresh = true; -- } -- -+ old_rte = __ioapic_read_entry(apic, pin, true); - /* mask the interrupt while we change the intremap table */ -- if ( !saved_mask ) -+ if ( !old_rte.mask ) - { - old_rte.mask = 1; -- __io_apic_write(apic, rte_lo, *((u32 *)&old_rte)); -+ __ioapic_write_entry(apic, pin, true, old_rte); - } - - /* Update interrupt remapping entry */ - rc = update_intremap_entry_from_ioapic( -- bdf, iommu, &new_rte, reg == rte_lo, -+ bdf, iommu, &new_rte, - &ioapic_sbdf[idx].pin_2_idx[pin]); - -- __io_apic_write(apic, reg, ((u32 *)&new_rte)[reg != rte_lo]); -- - if ( rc ) - { - /* Keep the entry masked. 
*/ -@@ -433,20 +364,7 @@ void cf_check amd_iommu_ioapic_update_ire( - return; - } - -- /* For lower bits access, return directly to avoid double writes */ -- if ( reg == rte_lo ) -- return; -- -- /* -- * Unmask the interrupt after we have updated the intremap table. Also -- * write the low half if a fresh entry was allocated for a high half -- * update in x2APIC mode. -- */ -- if ( !saved_mask || (x2apic_enabled && fresh) ) -- { -- old_rte.mask = saved_mask; -- __io_apic_write(apic, rte_lo, *((u32 *)&old_rte)); -- } -+ __ioapic_write_entry(apic, pin, true, new_rte); - } - - unsigned int cf_check amd_iommu_read_ioapic_from_ire( -diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h -index 39602d1f88..d49e40c5ce 100644 ---- a/xen/drivers/passthrough/vtd/extern.h -+++ b/xen/drivers/passthrough/vtd/extern.h -@@ -92,7 +92,7 @@ int cf_check intel_iommu_get_reserved_device_memory( - unsigned int cf_check io_apic_read_remap_rte( - unsigned int apic, unsigned int reg); - void cf_check io_apic_write_remap_rte( -- unsigned int apic, unsigned int reg, unsigned int value); -+ unsigned int apic, unsigned int pin, uint64_t rte); - - struct msi_desc; - struct msi_msg; -diff --git a/xen/drivers/passthrough/vtd/intremap.c b/xen/drivers/passthrough/vtd/intremap.c -index 53c9de9a75..78d7bc139a 100644 ---- a/xen/drivers/passthrough/vtd/intremap.c -+++ b/xen/drivers/passthrough/vtd/intremap.c -@@ -328,15 +328,14 @@ static int remap_entry_to_ioapic_rte( - - static int ioapic_rte_to_remap_entry(struct vtd_iommu *iommu, - int apic, unsigned int ioapic_pin, struct IO_xAPIC_route_entry *old_rte, -- unsigned int rte_upper, unsigned int value) -+ struct IO_xAPIC_route_entry new_rte) - { - struct iremap_entry *iremap_entry = NULL, *iremap_entries; - struct iremap_entry new_ire; - struct IO_APIC_route_remap_entry *remap_rte; -- struct IO_xAPIC_route_entry new_rte; - int index; - unsigned long flags; -- bool init = false; -+ bool init = false, masked = 
old_rte->mask; - - remap_rte = (struct IO_APIC_route_remap_entry *) old_rte; - spin_lock_irqsave(&iommu->intremap.lock, flags); -@@ -364,48 +363,40 @@ static int ioapic_rte_to_remap_entry(struct vtd_iommu *iommu, - - new_ire = *iremap_entry; - -- if ( rte_upper ) -- { -- if ( x2apic_enabled ) -- new_ire.remap.dst = value; -- else -- new_ire.remap.dst = (value >> 24) << 8; -- } -+ if ( x2apic_enabled ) -+ new_ire.remap.dst = new_rte.dest.dest32; - else -- { -- *(((u32 *)&new_rte) + 0) = value; -- new_ire.remap.fpd = 0; -- new_ire.remap.dm = new_rte.dest_mode; -- new_ire.remap.tm = new_rte.trigger; -- new_ire.remap.dlm = new_rte.delivery_mode; -- /* Hardware require RH = 1 for LPR delivery mode */ -- new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio); -- new_ire.remap.avail = 0; -- new_ire.remap.res_1 = 0; -- new_ire.remap.vector = new_rte.vector; -- new_ire.remap.res_2 = 0; -- -- set_ioapic_source_id(IO_APIC_ID(apic), &new_ire); -- new_ire.remap.res_3 = 0; -- new_ire.remap.res_4 = 0; -- new_ire.remap.p = 1; /* finally, set present bit */ -- -- /* now construct new ioapic rte entry */ -- remap_rte->vector = new_rte.vector; -- remap_rte->delivery_mode = 0; /* has to be 0 for remap format */ -- remap_rte->index_15 = (index >> 15) & 0x1; -- remap_rte->index_0_14 = index & 0x7fff; -- -- remap_rte->delivery_status = new_rte.delivery_status; -- remap_rte->polarity = new_rte.polarity; -- remap_rte->irr = new_rte.irr; -- remap_rte->trigger = new_rte.trigger; -- remap_rte->mask = new_rte.mask; -- remap_rte->reserved = 0; -- remap_rte->format = 1; /* indicate remap format */ -- } -- -- update_irte(iommu, iremap_entry, &new_ire, !init); -+ new_ire.remap.dst = GET_xAPIC_ID(new_rte.dest.dest32) << 8; -+ -+ new_ire.remap.dm = new_rte.dest_mode; -+ new_ire.remap.tm = new_rte.trigger; -+ new_ire.remap.dlm = new_rte.delivery_mode; -+ /* Hardware require RH = 1 for LPR delivery mode. 
*/ -+ new_ire.remap.rh = (new_ire.remap.dlm == dest_LowestPrio); -+ new_ire.remap.vector = new_rte.vector; -+ -+ set_ioapic_source_id(IO_APIC_ID(apic), &new_ire); -+ /* Finally, set present bit. */ -+ new_ire.remap.p = 1; -+ -+ /* Now construct new ioapic rte entry. */ -+ remap_rte->vector = new_rte.vector; -+ /* Has to be 0 for remap format. */ -+ remap_rte->delivery_mode = 0; -+ remap_rte->index_15 = (index >> 15) & 0x1; -+ remap_rte->index_0_14 = index & 0x7fff; -+ -+ remap_rte->delivery_status = new_rte.delivery_status; -+ remap_rte->polarity = new_rte.polarity; -+ remap_rte->irr = new_rte.irr; -+ remap_rte->trigger = new_rte.trigger; -+ remap_rte->mask = new_rte.mask; -+ remap_rte->reserved = 0; -+ /* Indicate remap format. */ -+ remap_rte->format = 1; -+ -+ /* If cmpxchg16b is not available the caller must mask the IO-APIC pin. */ -+ update_irte(iommu, iremap_entry, &new_ire, !init && !masked); - iommu_sync_cache(iremap_entry, sizeof(*iremap_entry)); - iommu_flush_iec_index(iommu, 0, index); - -@@ -439,36 +430,42 @@ unsigned int cf_check io_apic_read_remap_rte( - } - - void cf_check io_apic_write_remap_rte( -- unsigned int apic, unsigned int reg, unsigned int value) -+ unsigned int apic, unsigned int pin, uint64_t rte) - { -- unsigned int pin = (reg - 0x10) / 2; -+ struct IO_xAPIC_route_entry new_rte = { .raw = rte }; - struct IO_xAPIC_route_entry old_rte = { }; -- struct IO_APIC_route_remap_entry *remap_rte; -- unsigned int rte_upper = (reg & 1) ? 
1 : 0; - struct vtd_iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic)); -- int saved_mask; -- -- old_rte = __ioapic_read_entry(apic, pin, true); -- -- remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte; -- -- /* mask the interrupt while we change the intremap table */ -- saved_mask = remap_rte->mask; -- remap_rte->mask = 1; -- __io_apic_write(apic, reg & ~1, *(u32 *)&old_rte); -- remap_rte->mask = saved_mask; -+ bool masked = true; -+ int rc; - -- if ( ioapic_rte_to_remap_entry(iommu, apic, pin, -- &old_rte, rte_upper, value) ) -+ if ( !cpu_has_cx16 ) - { -- __io_apic_write(apic, reg, value); -+ /* -+ * Cannot atomically update the IRTE entry: mask the IO-APIC pin to -+ * avoid interrupts seeing an inconsistent IRTE entry. -+ */ -+ old_rte = __ioapic_read_entry(apic, pin, true); -+ if ( !old_rte.mask ) -+ { -+ masked = false; -+ old_rte.mask = 1; -+ __ioapic_write_entry(apic, pin, true, old_rte); -+ } -+ } - -- /* Recover the original value of 'mask' bit */ -- if ( rte_upper ) -- __io_apic_write(apic, reg & ~1, *(u32 *)&old_rte); -+ rc = ioapic_rte_to_remap_entry(iommu, apic, pin, &old_rte, new_rte); -+ if ( rc ) -+ { -+ if ( !masked ) -+ { -+ /* Recover the original value of 'mask' bit */ -+ old_rte.mask = 0; -+ __ioapic_write_entry(apic, pin, true, old_rte); -+ } -+ return; - } -- else -- __ioapic_write_entry(apic, pin, true, old_rte); -+ /* old_rte will contain the updated IO-APIC RTE on success. 
*/ -+ __ioapic_write_entry(apic, pin, true, old_rte); - } - - static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire) -diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c -index f671b0f2bb..8bd0ccb2e9 100644 ---- a/xen/drivers/passthrough/x86/iommu.c -+++ b/xen/drivers/passthrough/x86/iommu.c -@@ -142,9 +142,9 @@ int iommu_enable_x2apic(void) - } - - void iommu_update_ire_from_apic( -- unsigned int apic, unsigned int reg, unsigned int value) -+ unsigned int apic, unsigned int pin, uint64_t rte) - { -- iommu_vcall(&iommu_ops, update_ire_from_apic, apic, reg, value); -+ iommu_vcall(&iommu_ops, update_ire_from_apic, apic, pin, rte); - } - - unsigned int iommu_read_apic_from_ire(unsigned int apic, unsigned int reg) -diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h -index 4f22fc1bed..f8a52627f7 100644 ---- a/xen/include/xen/iommu.h -+++ b/xen/include/xen/iommu.h -@@ -274,7 +274,8 @@ struct iommu_ops { - int (*enable_x2apic)(void); - void (*disable_x2apic)(void); - -- void (*update_ire_from_apic)(unsigned int apic, unsigned int reg, unsigned int value); -+ void (*update_ire_from_apic)(unsigned int apic, unsigned int pin, -+ uint64_t rte); - unsigned int (*read_apic_from_ire)(unsigned int apic, unsigned int reg); - - int (*setup_hpet_msi)(struct msi_desc *); --- -2.42.0 - diff --git a/0023-build-correct-gas-noexecstack-check.patch b/0023-build-correct-gas-noexecstack-check.patch deleted file mode 100644 index 245d631..0000000 --- a/0023-build-correct-gas-noexecstack-check.patch +++ /dev/null @@ -1,34 +0,0 @@ -From ba360fbb6413231f84a7d68f5cb34858f81d4d23 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 5 Sep 2023 08:51:50 +0200 -Subject: [PATCH 23/55] build: correct gas --noexecstack check - -The check was missing an escape for the inner $, thus breaking things -in the unlikely event that the underlying assembler doesn't support this -option. 
- -Fixes: 62d22296a95d ("build: silence GNU ld warning about executable stacks") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: d1f6a58dfdc508c43a51c1865c826d519bf16493 -master date: 2023-08-14 09:58:19 +0200 ---- - xen/Makefile | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/Makefile b/xen/Makefile -index 7bb9de7bdc..455916c757 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -405,7 +405,7 @@ endif - - AFLAGS += -D__ASSEMBLY__ - --$(call cc-option-add,AFLAGS,CC,-Wa$(comma)--noexecstack) -+$(call cc-option-add,AFLAGS,CC,-Wa$$(comma)--noexecstack) - - LDFLAGS-$(call ld-option,--warn-rwx-segments) += --no-warn-rwx-segments - --- -2.42.0 - diff --git a/0024-libxl-slightly-correct-JSON-generation-of-CPU-policy.patch b/0024-libxl-slightly-correct-JSON-generation-of-CPU-policy.patch deleted file mode 100644 index 1ec7335..0000000 --- a/0024-libxl-slightly-correct-JSON-generation-of-CPU-policy.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 042982297802e7b746dc2fac95a453cc88d0aa83 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 5 Sep 2023 08:52:15 +0200 -Subject: [PATCH 24/55] libxl: slightly correct JSON generation of CPU policy - -The "cpuid_empty" label is also (in principle; maybe only for rubbish -input) reachable in the "cpuid_only" case. Hence the label needs to live -ahead of the check of the variable. 
- -Fixes: 5b80cecb747b ("libxl: introduce MSR data in libxl_cpuid_policy") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: ebce4e3a146c39e57bb7a890e059e89c32b6d547 -master date: 2023-08-17 16:24:17 +0200 ---- - tools/libs/light/libxl_cpuid.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c -index 849722541c..5c66d094b2 100644 ---- a/tools/libs/light/libxl_cpuid.c -+++ b/tools/libs/light/libxl_cpuid.c -@@ -710,10 +710,11 @@ parse_cpuid: - libxl__strdup(NOGC, libxl__json_object_get_string(r)); - } - } -+ -+cpuid_empty: - if (cpuid_only) - return 0; - --cpuid_empty: - co = libxl__json_map_get("msr", o, JSON_ARRAY); - if (!libxl__json_object_is_array(co)) - return ERROR_FAIL; --- -2.42.0 - diff --git a/0025-tboot-Disable-CET-at-shutdown.patch b/0025-tboot-Disable-CET-at-shutdown.patch deleted file mode 100644 index f06db61..0000000 --- a/0025-tboot-Disable-CET-at-shutdown.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 7ca58fbef489fcb17631872a2bdc929823a2a494 Mon Sep 17 00:00:00 2001 -From: Jason Andryuk <jandryuk@gmail.com> -Date: Tue, 5 Sep 2023 08:52:33 +0200 -Subject: [PATCH 25/55] tboot: Disable CET at shutdown - -tboot_shutdown() calls into tboot to perform the actual system shutdown. -tboot isn't built with endbr annotations, and Xen has CET-IBT enabled on -newer hardware. shutdown_entry isn't annotated with endbr and Xen -faults: - -Panic on CPU 0: -CONTROL-FLOW PROTECTION FAULT: #CP[0003] endbranch - -And Xen hangs at this point. - -Disabling CET-IBT let Xen and tboot power off, but reboot was -perfoming a poweroff instead of a warm reboot. Disabling all of CET, -i.e. shadow stacks as well, lets tboot reboot properly. 
- -Fixes: cdbe2b0a1aec ("x86: Enable CET Indirect Branch Tracking") -Signed-off-by: Jason Andryuk <jandryuk@gmail.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Daniel P. Smith <dpsmith@apertussolutions.com> -master commit: 0801868f550539d417d46f82c49307480947ccaa -master date: 2023-08-17 16:24:49 +0200 ---- - xen/arch/x86/tboot.c | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c -index fe1abfdf08..a2e9e97ed7 100644 ---- a/xen/arch/x86/tboot.c -+++ b/xen/arch/x86/tboot.c -@@ -398,6 +398,16 @@ void tboot_shutdown(uint32_t shutdown_type) - tboot_gen_xenheap_integrity(g_tboot_shared->s3_key, &xenheap_mac); - } - -+ /* -+ * Disable CET - tboot may not be built with endbr, and it doesn't support -+ * shadow stacks. -+ */ -+ if ( read_cr4() & X86_CR4_CET ) -+ { -+ wrmsrl(MSR_S_CET, 0); -+ write_cr4(read_cr4() & ~X86_CR4_CET); -+ } -+ - /* - * During early boot, we can be called by panic before idle_vcpu[0] is - * setup, but in that case we don't need to change page tables. 
--- -2.42.0 - diff --git a/0026-x86-svm-Fix-valid-condition-in-svm_get_pending_event.patch b/0026-x86-svm-Fix-valid-condition-in-svm_get_pending_event.patch deleted file mode 100644 index 10aa14f..0000000 --- a/0026-x86-svm-Fix-valid-condition-in-svm_get_pending_event.patch +++ /dev/null @@ -1,29 +0,0 @@ -From a939e953cdd522da3d8f0efeaea84448b5b570f9 Mon Sep 17 00:00:00 2001 -From: Jinoh Kang <jinoh.kang.kr@gmail.com> -Date: Tue, 5 Sep 2023 08:53:01 +0200 -Subject: [PATCH 26/55] x86/svm: Fix valid condition in svm_get_pending_event() - -Fixes: 9864841914c2 ("x86/vm_event: add support for VM_EVENT_REASON_INTERRUPT") -Signed-off-by: Jinoh Kang <jinoh.kang.kr@gmail.com> -master commit: b2865c2b6f164d2c379177cdd1cb200e4eaba549 -master date: 2023-08-18 20:21:44 +0100 ---- - xen/arch/x86/hvm/svm/svm.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c -index 5fa945c526..e8f50e7c5e 100644 ---- a/xen/arch/x86/hvm/svm/svm.c -+++ b/xen/arch/x86/hvm/svm/svm.c -@@ -2490,7 +2490,7 @@ static bool cf_check svm_get_pending_event( - { - const struct vmcb_struct *vmcb = v->arch.hvm.svm.vmcb; - -- if ( vmcb->event_inj.v ) -+ if ( !vmcb->event_inj.v ) - return false; - - info->vector = vmcb->event_inj.vector; --- -2.42.0 - diff --git a/0027-x86-vmx-Revert-x86-VMX-sanitize-rIP-before-re-enteri.patch b/0027-x86-vmx-Revert-x86-VMX-sanitize-rIP-before-re-enteri.patch deleted file mode 100644 index a022066..0000000 --- a/0027-x86-vmx-Revert-x86-VMX-sanitize-rIP-before-re-enteri.patch +++ /dev/null @@ -1,100 +0,0 @@ -From 8be85d8c0df2445c012fac42117396b483db5db0 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 5 Sep 2023 08:53:31 +0200 -Subject: [PATCH 27/55] x86/vmx: Revert "x86/VMX: sanitize rIP before - re-entering guest" -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -At the time of XSA-170, the x86 instruction emulator was 
genuinely broken. It -would load arbitrary values into %rip and putting a check here probably was -the best stopgap security fix. It should have been reverted following c/s -81d3a0b26c1 "x86emul: limit-check branch targets" which corrected the emulator -behaviour. - -However, everyone involved in XSA-170, myself included, failed to read the SDM -correctly. On the subject of %rip consistency checks, the SDM stated: - - If the processor supports N < 64 linear-address bits, bits 63:N must be - identical - -A non-canonical %rip (and SSP more recently) is an explicitly legal state in -x86, and the VMEntry consistency checks are intentionally off-by-one from a -regular canonical check. - -The consequence of this bug is that Xen will currently take a legal x86 state -which would successfully VMEnter, and corrupt it into having non-architectural -behaviour. - -Furthermore, in the time this bugfix has been pending in public, I -successfully persuaded Intel to clarify the SDM, adding the following -clarification: - - The guest RIP value is not required to be canonical; the value of bit N-1 - may differ from that of bit N. 
- -Fixes: ffbbfda377 ("x86/VMX: sanitize rIP before re-entering guest") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: 10c83bb0f5d158d101d983883741b76f927e54a3 -master date: 2023-08-23 18:44:59 +0100 ---- - xen/arch/x86/hvm/vmx/vmx.c | 34 +--------------------------------- - 1 file changed, 1 insertion(+), 33 deletions(-) - -diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c -index f256dc2635..072288a5ef 100644 ---- a/xen/arch/x86/hvm/vmx/vmx.c -+++ b/xen/arch/x86/hvm/vmx/vmx.c -@@ -3975,7 +3975,7 @@ static void undo_nmis_unblocked_by_iret(void) - void vmx_vmexit_handler(struct cpu_user_regs *regs) - { - unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0; -- unsigned int vector = 0, mode; -+ unsigned int vector = 0; - struct vcpu *v = current; - struct domain *currd = v->domain; - -@@ -4650,38 +4650,6 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) - out: - if ( nestedhvm_vcpu_in_guestmode(v) ) - nvmx_idtv_handling(); -- -- /* -- * VM entry will fail (causing the guest to get crashed) if rIP (and -- * rFLAGS, but we don't have an issue there) doesn't meet certain -- * criteria. As we must not allow less than fully privileged mode to have -- * such an effect on the domain, we correct rIP in that case (accepting -- * this not being architecturally correct behavior, as the injected #GP -- * fault will then not see the correct [invalid] return address). -- * And since we know the guest will crash, we crash it right away if it -- * already is in most privileged mode. -- */ -- mode = vmx_guest_x86_mode(v); -- if ( mode == 8 ? 
!is_canonical_address(regs->rip) -- : regs->rip != regs->eip ) -- { -- gprintk(XENLOG_WARNING, "Bad rIP %lx for mode %u\n", regs->rip, mode); -- -- if ( vmx_get_cpl() ) -- { -- __vmread(VM_ENTRY_INTR_INFO, &intr_info); -- if ( !(intr_info & INTR_INFO_VALID_MASK) ) -- hvm_inject_hw_exception(TRAP_gp_fault, 0); -- /* Need to fix rIP nevertheless. */ -- if ( mode == 8 ) -- regs->rip = (long)(regs->rip << (64 - VADDR_BITS)) >> -- (64 - VADDR_BITS); -- else -- regs->rip = regs->eip; -- } -- else -- domain_crash(v->domain); -- } - } - - static void lbr_tsx_fixup(void) --- -2.42.0 - diff --git a/0028-x86-irq-fix-reporting-of-spurious-i8259-interrupts.patch b/0028-x86-irq-fix-reporting-of-spurious-i8259-interrupts.patch deleted file mode 100644 index 2fcfd68..0000000 --- a/0028-x86-irq-fix-reporting-of-spurious-i8259-interrupts.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 699de512748d8e3bdcb3225b3b2a77c10cfd2408 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 5 Sep 2023 08:53:57 +0200 -Subject: [PATCH 28/55] x86/irq: fix reporting of spurious i8259 interrupts -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The return value of bogus_8259A_irq() is wrong: the function will -return `true` when the IRQ is real and `false` when it's a spurious -IRQ. This causes the "No irq handler for vector ..." message in -do_IRQ() to be printed for spurious i8259 interrupts which is not -intended (and not helpful). - -Fix by inverting the return value of bogus_8259A_irq(). 
- -Fixes: 132906348a14 ('x86/i8259: Handle bogus spurious interrupts more quietly') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 709f6c8ce6422475c372e67507606170a31ccb65 -master date: 2023-08-30 10:03:53 +0200 ---- - xen/arch/x86/i8259.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/i8259.c b/xen/arch/x86/i8259.c -index 6b35be10f0..ed9f55abe5 100644 ---- a/xen/arch/x86/i8259.c -+++ b/xen/arch/x86/i8259.c -@@ -37,7 +37,7 @@ static bool _mask_and_ack_8259A_irq(unsigned int irq); - - bool bogus_8259A_irq(unsigned int irq) - { -- return _mask_and_ack_8259A_irq(irq); -+ return !_mask_and_ack_8259A_irq(irq); - } - - static void cf_check mask_and_ack_8259A_irq(struct irq_desc *desc) --- -2.42.0 - diff --git a/0029-xen-arm-page-Handle-cache-flush-of-an-element-at-the.patch b/0029-xen-arm-page-Handle-cache-flush-of-an-element-at-the.patch deleted file mode 100644 index bc866d0..0000000 --- a/0029-xen-arm-page-Handle-cache-flush-of-an-element-at-the.patch +++ /dev/null @@ -1,111 +0,0 @@ -From d31e5b2a9c39816a954d1088d4cfc782f0006f39 Mon Sep 17 00:00:00 2001 -From: Stefano Stabellini <stefano.stabellini@amd.com> -Date: Tue, 5 Sep 2023 14:33:29 +0200 -Subject: [PATCH 29/55] xen/arm: page: Handle cache flush of an element at the - top of the address space - -The region that needs to be cleaned/invalidated may be at the top -of the address space. This means that 'end' (i.e. 'p + size') will -be 0 and therefore nothing will be cleaned/invalidated as the check -in the loop will always be false. - -On Arm64, we only support we only support up to 48-bit Virtual -address space. So this is not a concern there. However, for 32-bit, -the mapcache is using the last 2GB of the address space. Therefore -we may not clean/invalidate properly some pages. 
This could lead -to memory corruption or data leakage (the scrubbed value may -still sit in the cache when the guest could read directly the memory -and therefore read the old content). - -Rework invalidate_dcache_va_range(), clean_dcache_va_range(), -clean_and_invalidate_dcache_va_range() to handle a cache flush -with an element at the top of the address space. - -This is CVE-2023-34321 / XSA-437. - -Reported-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Stefano Stabellini <stefano.stabellini@amd.com> -Signed-off-by: Julien Grall <jgrall@amazon.com> -Acked-by: Bertrand Marquis <bertrand.marquis@arm.com> -master commit: 9a216e92de9f9011097e4f1fb55ff67ba0a21704 -master date: 2023-09-05 14:30:08 +0200 ---- - xen/arch/arm/include/asm/page.h | 33 ++++++++++++++++++++------------- - 1 file changed, 20 insertions(+), 13 deletions(-) - -diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h -index e7cd62190c..d7fe770a5e 100644 ---- a/xen/arch/arm/include/asm/page.h -+++ b/xen/arch/arm/include/asm/page.h -@@ -160,26 +160,25 @@ static inline size_t read_dcache_line_bytes(void) - - static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - { -- const void *end = p + size; - size_t cacheline_mask = dcache_line_bytes - 1; - - dsb(sy); /* So the CPU issues all writes to the range */ - - if ( (uintptr_t)p & cacheline_mask ) - { -+ size -= dcache_line_bytes - ((uintptr_t)p & cacheline_mask); - p = (void *)((uintptr_t)p & ~cacheline_mask); - asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); - p += dcache_line_bytes; - } -- if ( (uintptr_t)end & cacheline_mask ) -- { -- end = (void *)((uintptr_t)end & ~cacheline_mask); -- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (end)); -- } - -- for ( ; p < end; p += dcache_line_bytes ) -+ for ( ; size >= dcache_line_bytes; -+ p += dcache_line_bytes, size -= dcache_line_bytes ) - asm volatile (__invalidate_dcache_one(0) : : "r" (p)); - -+ if ( size > 0 ) -+ asm 
volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); -+ - dsb(sy); /* So we know the flushes happen before continuing */ - - return 0; -@@ -187,10 +186,14 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - - static inline int clean_dcache_va_range(const void *p, unsigned long size) - { -- const void *end = p + size; -+ size_t cacheline_mask = dcache_line_bytes - 1; -+ - dsb(sy); /* So the CPU issues all writes to the range */ -- p = (void *)((uintptr_t)p & ~(dcache_line_bytes - 1)); -- for ( ; p < end; p += dcache_line_bytes ) -+ size += (uintptr_t)p & cacheline_mask; -+ size = (size + cacheline_mask) & ~cacheline_mask; -+ p = (void *)((uintptr_t)p & ~cacheline_mask); -+ for ( ; size >= dcache_line_bytes; -+ p += dcache_line_bytes, size -= dcache_line_bytes ) - asm volatile (__clean_dcache_one(0) : : "r" (p)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. */ -@@ -200,10 +203,14 @@ static inline int clean_dcache_va_range(const void *p, unsigned long size) - static inline int clean_and_invalidate_dcache_va_range - (const void *p, unsigned long size) - { -- const void *end = p + size; -+ size_t cacheline_mask = dcache_line_bytes - 1; -+ - dsb(sy); /* So the CPU issues all writes to the range */ -- p = (void *)((uintptr_t)p & ~(dcache_line_bytes - 1)); -- for ( ; p < end; p += dcache_line_bytes ) -+ size += (uintptr_t)p & cacheline_mask; -+ size = (size + cacheline_mask) & ~cacheline_mask; -+ p = (void *)((uintptr_t)p & ~cacheline_mask); -+ for ( ; size >= dcache_line_bytes; -+ p += dcache_line_bytes, size -= dcache_line_bytes ) - asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. 
*/ --- -2.42.0 - diff --git a/0030-x86-AMD-extend-Zenbleed-check-to-models-good-ucode-i.patch b/0030-x86-AMD-extend-Zenbleed-check-to-models-good-ucode-i.patch deleted file mode 100644 index 4581d03..0000000 --- a/0030-x86-AMD-extend-Zenbleed-check-to-models-good-ucode-i.patch +++ /dev/null @@ -1,48 +0,0 @@ -From d2d2dcae879c6cc05227c9620f0a772f35fe6886 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Wed, 23 Aug 2023 09:26:36 +0200 -Subject: [PATCH 30/55] x86/AMD: extend Zenbleed check to models "good" ucode - isn't known for - -Reportedly the AMD Custom APU 0405 found on SteamDeck, models 0x90 and -0x91, (quoting the respective Linux commit) is similarly affected. Put -another instance of our Zen1 vs Zen2 distinction checks in -amd_check_zenbleed(), forcing use of the chickenbit irrespective of -ucode version (building upon real hardware never surfacing a version of -0xffffffff). - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -(cherry picked from commit 145a69c0944ac70cfcf9d247c85dee9e99d9d302) ---- - xen/arch/x86/cpu/amd.c | 13 ++++++++++--- - 1 file changed, 10 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index 3ea214fc2e..1bb3044be1 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -909,10 +909,17 @@ void amd_check_zenbleed(void) - case 0xa0 ... 0xaf: good_rev = 0x08a00008; break; - default: - /* -- * With the Fam17h check above, parts getting here are Zen1. -- * They're not affected. -+ * With the Fam17h check above, most parts getting here are -+ * Zen1. They're not affected. Assume Zen2 ones making it -+ * here are affected regardless of microcode version. -+ * -+ * Zen1 vs Zen2 isn't a simple model number comparison, so use -+ * STIBP as a heuristic to distinguish. 
- */ -- return; -+ if (!boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ return; -+ good_rev = ~0U; -+ break; - } - - rdmsrl(MSR_AMD64_DE_CFG, val); --- -2.42.0 - diff --git a/0031-x86-spec-ctrl-Fix-confusion-between-SPEC_CTRL_EXIT_T.patch b/0031-x86-spec-ctrl-Fix-confusion-between-SPEC_CTRL_EXIT_T.patch deleted file mode 100644 index 10417ae..0000000 --- a/0031-x86-spec-ctrl-Fix-confusion-between-SPEC_CTRL_EXIT_T.patch +++ /dev/null @@ -1,74 +0,0 @@ -From dc28aba565f226f9bec24cfde993e78478acfb4e Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Sep 2023 15:06:49 +0100 -Subject: [PATCH 31/55] x86/spec-ctrl: Fix confusion between - SPEC_CTRL_EXIT_TO_XEN{,_IST} - -c/s 3fffaf9c13e9 ("x86/entry: Avoid using alternatives in NMI/#MC paths") -dropped the only user, leaving behind the (incorrect) implication that Xen had -split exit paths. - -Delete the unused SPEC_CTRL_EXIT_TO_XEN and rename SPEC_CTRL_EXIT_TO_XEN_IST -to SPEC_CTRL_EXIT_TO_XEN for consistency. - -No functional change. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 1c18d73774533a55ba9d1cbee8bdace03efdb5e7) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 10 ++-------- - xen/arch/x86/x86_64/entry.S | 2 +- - 2 files changed, 3 insertions(+), 9 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index f23bb105c5..e8fd01243c 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -79,7 +79,6 @@ - * - SPEC_CTRL_ENTRY_FROM_PV - * - SPEC_CTRL_ENTRY_FROM_INTR - * - SPEC_CTRL_ENTRY_FROM_INTR_IST -- * - SPEC_CTRL_EXIT_TO_XEN_IST - * - SPEC_CTRL_EXIT_TO_XEN - * - SPEC_CTRL_EXIT_TO_PV - * -@@ -268,11 +267,6 @@ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ - X86_FEATURE_SC_MSR_PV - --/* Use when exiting to Xen context. 
*/ --#define SPEC_CTRL_EXIT_TO_XEN \ -- ALTERNATIVE "", \ -- DO_SPEC_CTRL_EXIT_TO_XEN, X86_FEATURE_SC_MSR_PV -- - /* Use when exiting to PV guest context. */ - #define SPEC_CTRL_EXIT_TO_PV \ - ALTERNATIVE "", \ -@@ -339,8 +333,8 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - UNLIKELY_END(\@_serialise) - .endm - --/* Use when exiting to Xen in IST context. */ --.macro SPEC_CTRL_EXIT_TO_XEN_IST -+/* Use when exiting to Xen context. */ -+.macro SPEC_CTRL_EXIT_TO_XEN - /* - * Requires %rbx=stack_end - * Clobbers %rax, %rcx, %rdx -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 7675a59ff0..b45a09823a 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -673,7 +673,7 @@ UNLIKELY_START(ne, exit_cr3) - UNLIKELY_END(exit_cr3) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ -- SPEC_CTRL_EXIT_TO_XEN_IST /* Req: %rbx=end, Clob: acd */ -+ SPEC_CTRL_EXIT_TO_XEN /* Req: %rbx=end, Clob: acd */ - - RESTORE_ALL adj=8 - iretq --- -2.42.0 - diff --git a/0032-x86-spec-ctrl-Fold-DO_SPEC_CTRL_EXIT_TO_XEN-into-it-.patch b/0032-x86-spec-ctrl-Fold-DO_SPEC_CTRL_EXIT_TO_XEN-into-it-.patch deleted file mode 100644 index a0c83da..0000000 --- a/0032-x86-spec-ctrl-Fold-DO_SPEC_CTRL_EXIT_TO_XEN-into-it-.patch +++ /dev/null @@ -1,85 +0,0 @@ -From 84690fb82c4f4aecb72a6789d8994efa74841e09 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Sep 2023 17:03:16 +0100 -Subject: [PATCH 32/55] x86/spec-ctrl: Fold DO_SPEC_CTRL_EXIT_TO_XEN into it's - single user - -With the SPEC_CTRL_EXIT_TO_XEN{,_IST} confusion fixed, it's now obvious that -there's only a single EXIT_TO_XEN path. Fold DO_SPEC_CTRL_EXIT_TO_XEN into -SPEC_CTRL_EXIT_TO_XEN to simplify further fixes. - -When merging labels, switch the name to .L\@_skip_sc_msr as "skip" on its own -is going to be too generic shortly. - -No functional change. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 694bb0f280fd08a4377e36e32b84b5062def4de2) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 40 ++++++++++-------------- - 1 file changed, 16 insertions(+), 24 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index e8fd01243c..d5f65d80ea 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -211,27 +211,6 @@ - wrmsr - .endm - --.macro DO_SPEC_CTRL_EXIT_TO_XEN --/* -- * Requires %rbx=stack_end -- * Clobbers %rax, %rcx, %rdx -- * -- * When returning to Xen context, look to see whether SPEC_CTRL shadowing is -- * in effect, and reload the shadow value. This covers race conditions which -- * exist with an NMI/MCE/etc hitting late in the return-to-guest path. -- */ -- xor %edx, %edx -- -- testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -- jz .L\@_skip -- -- mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax -- mov $MSR_SPEC_CTRL, %ecx -- wrmsr -- --.L\@_skip: --.endm -- - .macro DO_SPEC_CTRL_EXIT_TO_GUEST - /* - * Requires %eax=spec_ctrl, %rsp=regs/cpuinfo -@@ -340,11 +319,24 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - * Clobbers %rax, %rcx, %rdx - */ - testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -- jz .L\@_skip -+ jz .L\@_skip_sc_msr - -- DO_SPEC_CTRL_EXIT_TO_XEN -+ /* -+ * When returning to Xen context, look to see whether SPEC_CTRL shadowing -+ * is in effect, and reload the shadow value. This covers race conditions -+ * which exist with an NMI/MCE/etc hitting late in the return-to-guest -+ * path. 
-+ */ -+ xor %edx, %edx - --.L\@_skip: -+ testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -+ jz .L\@_skip_sc_msr -+ -+ mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax -+ mov $MSR_SPEC_CTRL, %ecx -+ wrmsr -+ -+.L\@_skip_sc_msr: - .endm - - #endif /* __ASSEMBLY__ */ --- -2.42.0 - diff --git a/0033-x86-spec-ctrl-Turn-the-remaining-SPEC_CTRL_-ENTRY-EX.patch b/0033-x86-spec-ctrl-Turn-the-remaining-SPEC_CTRL_-ENTRY-EX.patch deleted file mode 100644 index a278c5f..0000000 --- a/0033-x86-spec-ctrl-Turn-the-remaining-SPEC_CTRL_-ENTRY-EX.patch +++ /dev/null @@ -1,83 +0,0 @@ -From 3952c73bdbd05f0e666986fce633a591237b3c88 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 1 Sep 2023 11:38:44 +0100 -Subject: [PATCH 33/55] x86/spec-ctrl: Turn the remaining - SPEC_CTRL_{ENTRY,EXIT}_* into asm macros - -These have grown more complex over time, with some already having been -converted. - -Provide full Requires/Clobbers comments, otherwise missing at this level of -indirection. - -No functional change. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 7125429aafb9e3c9c88fc93001fc2300e0ac2cc8) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 37 ++++++++++++++++++------ - 1 file changed, 28 insertions(+), 9 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index d5f65d80ea..c6d5f2ad01 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -231,26 +231,45 @@ - .endm - - /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). 
*/ --#define SPEC_CTRL_ENTRY_FROM_PV \ -+.macro SPEC_CTRL_ENTRY_FROM_PV -+/* -+ * Requires %rsp=regs/cpuinfo, %rdx=0 -+ * Clobbers %rax, %rcx, %rdx -+ */ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=0), \ -- X86_FEATURE_IBPB_ENTRY_PV; \ -- ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ -+ X86_FEATURE_IBPB_ENTRY_PV -+ -+ ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV -+ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), \ - X86_FEATURE_SC_MSR_PV -+.endm - - /* Use in interrupt/exception context. May interrupt Xen or PV context. */ --#define SPEC_CTRL_ENTRY_FROM_INTR \ -+.macro SPEC_CTRL_ENTRY_FROM_INTR -+/* -+ * Requires %rsp=regs, %r14=stack_end, %rdx=0 -+ * Clobbers %rax, %rcx, %rdx -+ */ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=1), \ -- X86_FEATURE_IBPB_ENTRY_PV; \ -- ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ -+ X86_FEATURE_IBPB_ENTRY_PV -+ -+ ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV -+ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ - X86_FEATURE_SC_MSR_PV -+.endm - - /* Use when exiting to PV guest context. */ --#define SPEC_CTRL_EXIT_TO_PV \ -- ALTERNATIVE "", \ -- DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \ -+.macro SPEC_CTRL_EXIT_TO_PV -+/* -+ * Requires %rax=spec_ctrl, %rsp=regs/info -+ * Clobbers %rcx, %rdx -+ */ -+ ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV -+ - DO_SPEC_CTRL_COND_VERW -+.endm - - /* - * Use in IST interrupt/exception context. May interrupt Xen or PV context. 
--- -2.42.0 - diff --git a/0034-x86-spec-ctrl-Improve-all-SPEC_CTRL_-ENTER-EXIT-_-co.patch b/0034-x86-spec-ctrl-Improve-all-SPEC_CTRL_-ENTER-EXIT-_-co.patch deleted file mode 100644 index f360cbd..0000000 --- a/0034-x86-spec-ctrl-Improve-all-SPEC_CTRL_-ENTER-EXIT-_-co.patch +++ /dev/null @@ -1,106 +0,0 @@ -From ba023e93d0b1e60b80251bf080bab694efb9f8e3 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Wed, 30 Aug 2023 20:11:50 +0100 -Subject: [PATCH 34/55] x86/spec-ctrl: Improve all SPEC_CTRL_{ENTER,EXIT}_* - comments - -... to better explain how they're used. - -Doing so highlights that SPEC_CTRL_EXIT_TO_XEN is missing a VERW flush for the -corner case when e.g. an NMI hits late in an exit-to-guest path. - -Leave a TODO, which will be addressed in subsequent patches which arrange for -VERW flushing to be safe within SPEC_CTRL_EXIT_TO_XEN. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 45f00557350dc7d0756551069803fc49c29184ca) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 36 ++++++++++++++++++++---- - 1 file changed, 31 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index c6d5f2ad01..97c4db31cd 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -230,7 +230,10 @@ - wrmsr - .endm - --/* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */ -+/* -+ * Used after an entry from PV context: SYSCALL, SYSENTER, INT, -+ * etc. There is always a guest speculation state in context. -+ */ - .macro SPEC_CTRL_ENTRY_FROM_PV - /* - * Requires %rsp=regs/cpuinfo, %rdx=0 -@@ -245,7 +248,11 @@ - X86_FEATURE_SC_MSR_PV - .endm - --/* Use in interrupt/exception context. May interrupt Xen or PV context. */ -+/* -+ * Used after an exception or maskable interrupt, hitting Xen or PV context. 
-+ * There will either be a guest speculation context, or (barring fatal -+ * exceptions) a well-formed Xen speculation context. -+ */ - .macro SPEC_CTRL_ENTRY_FROM_INTR - /* - * Requires %rsp=regs, %r14=stack_end, %rdx=0 -@@ -260,7 +267,10 @@ - X86_FEATURE_SC_MSR_PV - .endm - --/* Use when exiting to PV guest context. */ -+/* -+ * Used when exiting from any entry context, back to PV context. This -+ * includes from an IST entry which moved onto the primary stack. -+ */ - .macro SPEC_CTRL_EXIT_TO_PV - /* - * Requires %rax=spec_ctrl, %rsp=regs/info -@@ -272,7 +282,13 @@ - .endm - - /* -- * Use in IST interrupt/exception context. May interrupt Xen or PV context. -+ * Used after an IST entry hitting Xen or PV context. Special care is needed, -+ * because when hitting Xen context, there may not be a well-formed -+ * speculation context. (i.e. it can hit in the middle of -+ * SPEC_CTRL_{ENTRY,EXIT}_* regions.) -+ * -+ * An IST entry which hits PV context moves onto the primary stack and leaves -+ * via SPEC_CTRL_EXIT_TO_PV, *not* SPEC_CTRL_EXIT_TO_XEN. - */ - .macro SPEC_CTRL_ENTRY_FROM_INTR_IST - /* -@@ -331,7 +347,14 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - UNLIKELY_END(\@_serialise) - .endm - --/* Use when exiting to Xen context. */ -+/* -+ * Use when exiting from any entry context, back to Xen context. This -+ * includes returning to other SPEC_CTRL_{ENTRY,EXIT}_* regions with an -+ * incomplete speculation context. -+ * -+ * Because we might have interrupted Xen beyond SPEC_CTRL_EXIT_TO_$GUEST, we -+ * need to treat this as if it were an EXIT_TO_$GUEST case too. 
-+ */ - .macro SPEC_CTRL_EXIT_TO_XEN - /* - * Requires %rbx=stack_end -@@ -356,6 +379,9 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - wrmsr - - .L\@_skip_sc_msr: -+ -+ /* TODO VERW */ -+ - .endm - - #endif /* __ASSEMBLY__ */ --- -2.42.0 - diff --git a/0035-x86-entry-Adjust-restore_all_xen-to-hold-stack_end-i.patch b/0035-x86-entry-Adjust-restore_all_xen-to-hold-stack_end-i.patch deleted file mode 100644 index fe2acaf..0000000 --- a/0035-x86-entry-Adjust-restore_all_xen-to-hold-stack_end-i.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 5f7efd47c8273fde972637d0360851802f76eca9 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Wed, 13 Sep 2023 13:48:16 +0100 -Subject: [PATCH 35/55] x86/entry: Adjust restore_all_xen to hold stack_end in - %r14 - -All other SPEC_CTRL_{ENTRY,EXIT}_* helpers hold stack_end in %r14. Adjust it -for consistency. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 7aa28849a1155d856e214e9a80a7e65fffdc3e58) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 8 ++++---- - xen/arch/x86/x86_64/entry.S | 8 ++++---- - 2 files changed, 8 insertions(+), 8 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index 97c4db31cd..66c706496f 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -357,10 +357,10 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - .macro SPEC_CTRL_EXIT_TO_XEN - /* -- * Requires %rbx=stack_end -+ * Requires %r14=stack_end - * Clobbers %rax, %rcx, %rdx - */ -- testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -+ testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) - jz .L\@_skip_sc_msr - - /* -@@ -371,10 +371,10 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - xor %edx, %edx - -- testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -+ testb $SCF_use_shadow, 
STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) - jz .L\@_skip_sc_msr - -- mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%rbx), %eax -+ mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%r14), %eax - mov $MSR_SPEC_CTRL, %ecx - wrmsr - -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index b45a09823a..92279a225d 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -665,15 +665,15 @@ restore_all_xen: - * Check whether we need to switch to the per-CPU page tables, in - * case we return to late PV exit code (from an NMI or #MC). - */ -- GET_STACK_END(bx) -- cmpb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%rbx) -+ GET_STACK_END(14) -+ cmpb $0, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) - UNLIKELY_START(ne, exit_cr3) -- mov STACK_CPUINFO_FIELD(pv_cr3)(%rbx), %rax -+ mov STACK_CPUINFO_FIELD(pv_cr3)(%r14), %rax - mov %rax, %cr3 - UNLIKELY_END(exit_cr3) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ -- SPEC_CTRL_EXIT_TO_XEN /* Req: %rbx=end, Clob: acd */ -+ SPEC_CTRL_EXIT_TO_XEN /* Req: %r14=end, Clob: acd */ - - RESTORE_ALL adj=8 - iretq --- -2.42.0 - diff --git a/0036-x86-entry-Track-the-IST-ness-of-an-entry-for-the-exi.patch b/0036-x86-entry-Track-the-IST-ness-of-an-entry-for-the-exi.patch deleted file mode 100644 index ba7ea21..0000000 --- a/0036-x86-entry-Track-the-IST-ness-of-an-entry-for-the-exi.patch +++ /dev/null @@ -1,109 +0,0 @@ -From e4a71bc0da0baf7464bb0d8e33053f330e5ea366 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Wed, 13 Sep 2023 12:20:12 +0100 -Subject: [PATCH 36/55] x86/entry: Track the IST-ness of an entry for the exit - paths - -Use %r12 to hold an ist_exit boolean. This register is zero elsewhere in the -entry/exit asm, so it only needs setting in the IST path. - -As this is subtle and fragile, add check_ist_exit() to be used in debugging -builds to cross-check that the ist_exit boolean matches the entry vector. 
- -Write check_ist_exit() it in C, because it's debug only and the logic more -complicated than I care to maintain in asm. - -For now, we only need to use this signal in the exit-to-Xen path, but some -exit-to-guest paths happen in IST context too. Check the correctness in all -exit paths to avoid the logic bit-rotting. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 21bdc25b05a0f8ab6bc73520a9ca01327360732c) - -x86/entry: Partially revert IST-exit checks - -The patch adding check_ist_exit() didn't account for the fact that -reset_stack_and_jump() is not an ABI-preserving boundary. The IST-ness in -%r12 doesn't survive into the next context, and is a stale value C. - -This shows up in Gitlab CI for the Clang build: - - https://gitlab.com/xen-project/people/andyhhp/xen/-/jobs/5112783827 - -and in OSSTest for GCC 8: - - http://logs.test-lab.xenproject.org/osstest/logs/183045/test-amd64-amd64-xl-qemuu-debianhvm-amd64/serial-pinot0.log - -There's no straightforward way to reconstruct the IST-exit-ness on the -exit-to-guest path after a context switch. For now, we only need IST-exit on -the return-to-Xen path. 
- -Fixes: 21bdc25b05a0 ("x86/entry: Track the IST-ness of an entry for the exit paths") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 9b57c800b79b96769ea3dcd6468578fa664d19f9) ---- - xen/arch/x86/traps.c | 13 +++++++++++++ - xen/arch/x86/x86_64/entry.S | 13 ++++++++++++- - 2 files changed, 25 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c -index d12004b1c6..e65cc60041 100644 ---- a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -2315,6 +2315,19 @@ void asm_domain_crash_synchronous(unsigned long addr) - do_softirq(); - } - -+#ifdef CONFIG_DEBUG -+void check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit) -+{ -+ const unsigned int ist_mask = -+ (1U << X86_EXC_NMI) | (1U << X86_EXC_DB) | -+ (1U << X86_EXC_DF) | (1U << X86_EXC_MC); -+ uint8_t ev = regs->entry_vector; -+ bool is_ist = (ev < TRAP_nr) && ((1U << ev) & ist_mask); -+ -+ ASSERT(is_ist == ist_exit); -+} -+#endif -+ - /* - * Local variables: - * mode: C -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 92279a225d..4cebc4fbe3 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -659,8 +659,15 @@ ENTRY(early_page_fault) - .section .text.entry, "ax", @progbits - - ALIGN --/* No special register assumptions. */ -+/* %r12=ist_exit */ - restore_all_xen: -+ -+#ifdef CONFIG_DEBUG -+ mov %rsp, %rdi -+ mov %r12, %rsi -+ call check_ist_exit -+#endif -+ - /* - * Check whether we need to switch to the per-CPU page tables, in - * case we return to late PV exit code (from an NMI or #MC). 
-@@ -1091,6 +1098,10 @@ handle_ist_exception: - .L_ist_dispatch_done: - mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) - mov %bl, STACK_CPUINFO_FIELD(use_pv_cr3)(%r14) -+ -+ /* This is an IST exit */ -+ mov $1, %r12d -+ - cmpb $TRAP_nmi,UREGS_entry_vector(%rsp) - jne ret_from_intr - --- -2.42.0 - diff --git a/0037-x86-spec-ctrl-Issue-VERW-during-IST-exit-to-Xen.patch b/0037-x86-spec-ctrl-Issue-VERW-during-IST-exit-to-Xen.patch deleted file mode 100644 index 6580907..0000000 --- a/0037-x86-spec-ctrl-Issue-VERW-during-IST-exit-to-Xen.patch +++ /dev/null @@ -1,89 +0,0 @@ -From 2e2c3efcfc9f183674a8de6ed954ffbe7188b70d Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Wed, 13 Sep 2023 13:53:33 +0100 -Subject: [PATCH 37/55] x86/spec-ctrl: Issue VERW during IST exit to Xen - -There is a corner case where e.g. an NMI hitting an exit-to-guest path after -SPEC_CTRL_EXIT_TO_* would have run the entire NMI handler *after* the VERW -flush to scrub potentially sensitive data from uarch buffers. - -In order to compensate, issue VERW when exiting to Xen from an IST entry. - -SPEC_CTRL_EXIT_TO_XEN already has two reads of spec_ctrl_flags off the stack, -and we're about to add a third. Load the field into %ebx, and list the -register as clobbered. - -%r12 has been arranged to be the ist_exit signal, so add this as an input -dependency and use it to identify when to issue a VERW. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 3ee6066bcd737756b0990d417d94eddc0b0d2585) ---- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 20 +++++++++++++++----- - xen/arch/x86/x86_64/entry.S | 2 +- - 2 files changed, 16 insertions(+), 6 deletions(-) - -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index 66c706496f..28a75796e6 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -357,10 +357,12 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - .macro SPEC_CTRL_EXIT_TO_XEN - /* -- * Requires %r14=stack_end -- * Clobbers %rax, %rcx, %rdx -+ * Requires %r12=ist_exit, %r14=stack_end -+ * Clobbers %rax, %rbx, %rcx, %rdx - */ -- testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) -+ movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx -+ -+ testb $SCF_ist_sc_msr, %bl - jz .L\@_skip_sc_msr - - /* -@@ -371,7 +373,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - */ - xor %edx, %edx - -- testb $SCF_use_shadow, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) -+ testb $SCF_use_shadow, %bl - jz .L\@_skip_sc_msr - - mov STACK_CPUINFO_FIELD(shadow_spec_ctrl)(%r14), %eax -@@ -380,8 +382,16 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - - .L\@_skip_sc_msr: - -- /* TODO VERW */ -+ test %r12, %r12 -+ jz .L\@_skip_ist_exit -+ -+ /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */ -+ testb $SCF_verw, %bl -+ jz .L\@_skip_verw -+ verw STACK_CPUINFO_FIELD(verw_sel)(%r14) -+.L\@_skip_verw: - -+.L\@_skip_ist_exit: - .endm - - #endif /* __ASSEMBLY__ */ -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 4cebc4fbe3..c12e011b4d 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -680,7 +680,7 @@ UNLIKELY_START(ne, exit_cr3) - UNLIKELY_END(exit_cr3) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. 
*/ -- SPEC_CTRL_EXIT_TO_XEN /* Req: %r14=end, Clob: acd */ -+ SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end, Clob: abcd */ - - RESTORE_ALL adj=8 - iretq --- -2.42.0 - diff --git a/0038-x86-amd-Introduce-is_zen-1-2-_uarch-predicates.patch b/0038-x86-amd-Introduce-is_zen-1-2-_uarch-predicates.patch deleted file mode 100644 index 6f2cdcb..0000000 --- a/0038-x86-amd-Introduce-is_zen-1-2-_uarch-predicates.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 19ee1e1faa32b79274b3484cb1170a5970f1e602 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 15 Sep 2023 12:13:51 +0100 -Subject: [PATCH 38/55] x86/amd: Introduce is_zen{1,2}_uarch() predicates - -We already have 3 cases using STIBP as a Zen1/2 heuristic, and are about to -introduce a 4th. Wrap the heuristic into a pair of predicates rather than -opencoding it, and the explanation of the heuristic, at each usage site. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit de1d265001397f308c5c3c5d3ffc30e7ef8c0705) ---- - xen/arch/x86/cpu/amd.c | 18 ++++-------------- - xen/arch/x86/include/asm/amd.h | 11 +++++++++++ - 2 files changed, 15 insertions(+), 14 deletions(-) - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index 1bb3044be1..e94ba5a0e0 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -855,15 +855,13 @@ void amd_set_legacy_ssbd(bool enable) - * non-branch instructions to be ignored. It is to be set unilaterally in - * newer microcode. - * -- * This chickenbit is something unrelated on Zen1, and Zen1 vs Zen2 isn't a -- * simple model number comparison, so use STIBP as a heuristic to separate the -- * two uarches in Fam17h(AMD)/18h(Hygon). -+ * This chickenbit is something unrelated on Zen1. 
- */ - void amd_init_spectral_chicken(void) - { - uint64_t val, chickenbit = 1 << 1; - -- if (cpu_has_hypervisor || !boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ if (cpu_has_hypervisor || !is_zen2_uarch()) - return; - - if (rdmsr_safe(MSR_AMD64_DE_CFG2, val) == 0 && !(val & chickenbit)) -@@ -912,11 +910,8 @@ void amd_check_zenbleed(void) - * With the Fam17h check above, most parts getting here are - * Zen1. They're not affected. Assume Zen2 ones making it - * here are affected regardless of microcode version. -- * -- * Zen1 vs Zen2 isn't a simple model number comparison, so use -- * STIBP as a heuristic to distinguish. - */ -- if (!boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ if (is_zen1_uarch()) - return; - good_rev = ~0U; - break; -@@ -1277,12 +1272,7 @@ static int __init cf_check zen2_c6_errata_check(void) - */ - s_time_t delta; - -- /* -- * Zen1 vs Zen2 isn't a simple model number comparison, so use STIBP as -- * a heuristic to separate the two uarches in Fam17h. -- */ -- if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || -- !boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || !is_zen2_uarch()) - return 0; - - /* -diff --git a/xen/arch/x86/include/asm/amd.h b/xen/arch/x86/include/asm/amd.h -index a975d3de26..82324110ab 100644 ---- a/xen/arch/x86/include/asm/amd.h -+++ b/xen/arch/x86/include/asm/amd.h -@@ -140,6 +140,17 @@ - AMD_MODEL_RANGE(0x11, 0x0, 0x0, 0xff, 0xf), \ - AMD_MODEL_RANGE(0x12, 0x0, 0x0, 0xff, 0xf)) - -+/* -+ * The Zen1 and Zen2 microarchitectures are implemented by AMD (Fam17h) and -+ * Hygon (Fam18h) but without simple model number rules. Instead, use STIBP -+ * as a heuristic that distinguishes the two. -+ * -+ * The caller is required to perform the appropriate vendor/family checks -+ * first. 
-+ */ -+#define is_zen1_uarch() (!boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+#define is_zen2_uarch() boot_cpu_has(X86_FEATURE_AMD_STIBP) -+ - struct cpuinfo_x86; - int cpu_has_amd_erratum(const struct cpuinfo_x86 *, int, ...); - --- -2.42.0 - diff --git a/0039-x86-spec-ctrl-Mitigate-the-Zen1-DIV-leakage.patch b/0039-x86-spec-ctrl-Mitigate-the-Zen1-DIV-leakage.patch deleted file mode 100644 index 4b23d12..0000000 --- a/0039-x86-spec-ctrl-Mitigate-the-Zen1-DIV-leakage.patch +++ /dev/null @@ -1,228 +0,0 @@ -From 9ac2f49f5fa3a5159409241d4f74fb0d721dd4c5 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Wed, 30 Aug 2023 20:24:25 +0100 -Subject: [PATCH 39/55] x86/spec-ctrl: Mitigate the Zen1 DIV leakage - -In the Zen1 microarchitecure, there is one divider in the pipeline which -services uops from both threads. In the case of #DE, the latched result from -the previous DIV to execute will be forwarded speculatively. - -This is an interesting covert channel that allows two threads to communicate -without any system calls. In also allows userspace to obtain the result of -the most recent DIV instruction executed (even speculatively) in the core, -which can be from a higher privilege context. - -Scrub the result from the divider by executing a non-faulting divide. This -needs performing on the exit-to-guest paths, and ist_exit-to-Xen. - -Alternatives in IST context is believed safe now that it's done in NMI -context. - -This is XSA-439 / CVE-2023-20588. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit b5926c6ecf05c28ee99c6248c42d691ccbf0c315) ---- - docs/misc/xen-command-line.pandoc | 6 ++- - xen/arch/x86/hvm/svm/entry.S | 1 + - xen/arch/x86/include/asm/cpufeatures.h | 2 +- - xen/arch/x86/include/asm/spec_ctrl_asm.h | 17 +++++++++ - xen/arch/x86/spec_ctrl.c | 48 +++++++++++++++++++++++- - 5 files changed, 71 insertions(+), 3 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index d9dae740cc..b92c8f969c 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2315,7 +2315,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - > {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, --> unpriv-mmio,gds-mit}=<bool> ]` -+> unpriv-mmio,gds-mit,div-scrub}=<bool> ]` - - Controls for speculative execution sidechannel mitigations. By default, Xen - will pick the most appropriate mitigations based on compiled in support, -@@ -2437,6 +2437,10 @@ has elected not to lock the configuration, Xen will use GDS_CTRL to mitigate - GDS with. Otherwise, Xen will mitigate by disabling AVX, which blocks the use - of the AVX2 Gather instructions. - -+On all hardware, the `div-scrub=` option can be used to force or prevent Xen -+from mitigating the DIV-leakage vulnerability. By default, Xen will mitigate -+DIV-leakage on hardware believed to be vulnerable. -+ - ### sync_console - > `= <boolean>` - -diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S -index 981cd82e7c..934f12cf5c 100644 ---- a/xen/arch/x86/hvm/svm/entry.S -+++ b/xen/arch/x86/hvm/svm/entry.S -@@ -74,6 +74,7 @@ __UNLIKELY_END(nsvm_hap) - 1: /* No Spectre v1 concerns. Execution will hit VMRUN imminently. 
*/ - .endm - ALTERNATIVE "", svm_vmentry_spec_ctrl, X86_FEATURE_SC_MSR_HVM -+ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV - - pop %r15 - pop %r14 -diff --git a/xen/arch/x86/include/asm/cpufeatures.h b/xen/arch/x86/include/asm/cpufeatures.h -index da0593de85..c3aad21c3b 100644 ---- a/xen/arch/x86/include/asm/cpufeatures.h -+++ b/xen/arch/x86/include/asm/cpufeatures.h -@@ -35,7 +35,7 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM - XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ - XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */ - XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ --/* Bits 23 unused. */ -+XEN_CPUFEATURE(SC_DIV, X86_SYNTH(23)) /* DIV scrub needed */ - XEN_CPUFEATURE(SC_RSB_IDLE, X86_SYNTH(24)) /* RSB overwrite needed for idle. */ - XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ - XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */ -diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h -index 28a75796e6..f4b8b9d956 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl_asm.h -+++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h -@@ -177,6 +177,19 @@ - .L\@_verw_skip: - .endm - -+.macro DO_SPEC_CTRL_DIV -+/* -+ * Requires nothing -+ * Clobbers %rax -+ * -+ * Issue a DIV for its flushing side effect (Zen1 uarch specific). Any -+ * non-faulting DIV will do; a byte DIV has least latency, and doesn't clobber -+ * %rdx. 
-+ */ -+ mov $1, %eax -+ div %al -+.endm -+ - .macro DO_SPEC_CTRL_ENTRY maybexen:req - /* - * Requires %rsp=regs (also cpuinfo if !maybexen) -@@ -279,6 +292,8 @@ - ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV - - DO_SPEC_CTRL_COND_VERW -+ -+ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV - .endm - - /* -@@ -391,6 +406,8 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - verw STACK_CPUINFO_FIELD(verw_sel)(%r14) - .L\@_skip_verw: - -+ ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV -+ - .L\@_skip_ist_exit: - .endm - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 79b98f0fe7..0ff3c895ac 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -79,6 +79,7 @@ static int8_t __initdata opt_srb_lock = -1; - static bool __initdata opt_unpriv_mmio; - static bool __ro_after_init opt_fb_clear_mmio; - static int8_t __initdata opt_gds_mit = -1; -+static int8_t __initdata opt_div_scrub = -1; - - static int __init cf_check parse_spec_ctrl(const char *s) - { -@@ -133,6 +134,7 @@ static int __init cf_check parse_spec_ctrl(const char *s) - opt_srb_lock = 0; - opt_unpriv_mmio = false; - opt_gds_mit = 0; -+ opt_div_scrub = 0; - } - else if ( val > 0 ) - rc = -EINVAL; -@@ -285,6 +287,8 @@ static int __init cf_check parse_spec_ctrl(const char *s) - opt_unpriv_mmio = val; - else if ( (val = parse_boolean("gds-mit", s, ss)) >= 0 ) - opt_gds_mit = val; -+ else if ( (val = parse_boolean("div-scrub", s, ss)) >= 0 ) -+ opt_div_scrub = val; - else - rc = -EINVAL; - -@@ -485,7 +489,7 @@ static void __init print_details(enum ind_thunk thunk) - "\n"); - - /* Settings for Xen's protection, irrespective of guests. */ -- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s\n", -+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", - thunk == THUNK_NONE ? "N/A" : - thunk == THUNK_RETPOLINE ? "RETPOLINE" : - thunk == THUNK_LFENCE ? 
"LFENCE" : -@@ -510,6 +514,7 @@ static void __init print_details(enum ind_thunk thunk) - opt_l1d_flush ? " L1D_FLUSH" : "", - opt_md_clear_pv || opt_md_clear_hvm || - opt_fb_clear_mmio ? " VERW" : "", -+ opt_div_scrub ? " DIV" : "", - opt_branch_harden ? " BRANCH_HARDEN" : ""); - - /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ -@@ -967,6 +972,45 @@ static void __init srso_calculations(bool hw_smt_enabled) - setup_force_cpu_cap(X86_FEATURE_SRSO_NO); - } - -+/* -+ * The Div leakage issue is specific to the AMD Zen1 microarchitecure. -+ * -+ * However, there's no $FOO_NO bit defined, so if we're virtualised we have no -+ * hope of spotting the case where we might move to vulnerable hardware. We -+ * also can't make any useful conclusion about SMT-ness. -+ * -+ * Don't check the hypervisor bit, so at least we do the safe thing when -+ * booting on something that looks like a Zen1 CPU. -+ */ -+static bool __init has_div_vuln(void) -+{ -+ if ( !(boot_cpu_data.x86_vendor & -+ (X86_VENDOR_AMD | X86_VENDOR_HYGON)) ) -+ return false; -+ -+ if ( boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18 ) -+ return false; -+ -+ return is_zen1_uarch(); -+} -+ -+static void __init div_calculations(bool hw_smt_enabled) -+{ -+ bool cpu_bug_div = has_div_vuln(); -+ -+ if ( opt_div_scrub == -1 ) -+ opt_div_scrub = cpu_bug_div; -+ -+ if ( opt_div_scrub ) -+ setup_force_cpu_cap(X86_FEATURE_SC_DIV); -+ -+ if ( opt_smt == -1 && !cpu_has_hypervisor && cpu_bug_div && hw_smt_enabled ) -+ warning_add( -+ "Booted on leaky-DIV hardware with SMT/Hyperthreading\n" -+ "enabled. Please assess your configuration and choose an\n" -+ "explicit 'smt=<bool>' setting. See XSA-439.\n"); -+} -+ - static void __init ibpb_calculations(void) - { - bool def_ibpb_entry = false; -@@ -1726,6 +1770,8 @@ void __init init_speculation_mitigations(void) - - ibpb_calculations(); - -+ div_calculations(hw_smt_enabled); -+ - /* Check whether Eager FPU should be enabled by default. 
*/ - if ( opt_eager_fpu == -1 ) - opt_eager_fpu = should_use_eager_fpu(); --- -2.42.0 - diff --git a/0040-x86-shadow-defer-releasing-of-PV-s-top-level-shadow-.patch b/0040-x86-shadow-defer-releasing-of-PV-s-top-level-shadow-.patch deleted file mode 100644 index 21fb16f..0000000 --- a/0040-x86-shadow-defer-releasing-of-PV-s-top-level-shadow-.patch +++ /dev/null @@ -1,455 +0,0 @@ -From 90c540c58985dc774cf0a1d2dc423473d3f37267 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <JBeulich@suse.com> -Date: Wed, 20 Sep 2023 10:33:26 +0100 -Subject: [PATCH 40/55] x86/shadow: defer releasing of PV's top-level shadow - reference - -sh_set_toplevel_shadow() re-pinning the top-level shadow we may be -running on is not enough (and at the same time unnecessary when the -shadow isn't what we're running on): That shadow becomes eligible for -blowing away (from e.g. shadow_prealloc()) immediately after the -paging lock was dropped. Yet it needs to remain valid until the actual -page table switch occurred. - -Propagate up the call chain the shadow entry that needs releasing -eventually, and carry out the release immediately after switching page -tables. Handle update_cr3() failures by switching to idle pagetables. -Note that various further uses of update_cr3() are HVM-only or only act -on paused vCPU-s, in which case sh_set_toplevel_shadow() will not defer -releasing of the reference. - -While changing the update_cr3() hook, also convert the "do_locking" -parameter to boolean. - -This is CVE-2023-34322 / XSA-438. 
- -Reported-by: Tim Deegan <tim@xen.org> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: George Dunlap <george.dunlap@cloud.com> -(cherry picked from commit fb0ff49fe9f784bfee0370c2a3c5f20e39d7a1cb) ---- - xen/arch/x86/include/asm/mm.h | 2 +- - xen/arch/x86/include/asm/paging.h | 6 ++-- - xen/arch/x86/include/asm/shadow.h | 8 +++++ - xen/arch/x86/mm.c | 27 +++++++++++---- - xen/arch/x86/mm/hap/hap.c | 6 ++-- - xen/arch/x86/mm/shadow/common.c | 55 ++++++++++++++++++++----------- - xen/arch/x86/mm/shadow/multi.c | 33 ++++++++++++------- - xen/arch/x86/mm/shadow/none.c | 4 ++- - xen/arch/x86/mm/shadow/private.h | 14 ++++---- - xen/arch/x86/pv/domain.c | 25 ++++++++++++-- - 10 files changed, 127 insertions(+), 53 deletions(-) - -diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h -index d723c7c38f..a5d7fdd32e 100644 ---- a/xen/arch/x86/include/asm/mm.h -+++ b/xen/arch/x86/include/asm/mm.h -@@ -552,7 +552,7 @@ void audit_domains(void); - #endif - - void make_cr3(struct vcpu *v, mfn_t mfn); --void update_cr3(struct vcpu *v); -+pagetable_t update_cr3(struct vcpu *v); - int vcpu_destroy_pagetables(struct vcpu *); - void *do_page_walk(struct vcpu *v, unsigned long addr); - -diff --git a/xen/arch/x86/include/asm/paging.h b/xen/arch/x86/include/asm/paging.h -index 6f7000d5f4..94c590f31a 100644 ---- a/xen/arch/x86/include/asm/paging.h -+++ b/xen/arch/x86/include/asm/paging.h -@@ -138,7 +138,7 @@ struct paging_mode { - paddr_t ga, uint32_t *pfec, - unsigned int *page_order); - #endif -- void (*update_cr3 )(struct vcpu *v, int do_locking, -+ pagetable_t (*update_cr3 )(struct vcpu *v, bool do_locking, - bool noflush); - void (*update_paging_modes )(struct vcpu *v); - bool (*flush_tlb )(const unsigned long *vcpu_bitmap); -@@ -310,9 +310,9 @@ static inline unsigned long paging_ga_to_gfn_cr3(struct vcpu *v, - /* Update all the things that are derived from the guest's CR3. 
- * Called when the guest changes CR3; the caller can then use v->arch.cr3 - * as the value to load into the host CR3 to schedule this vcpu */ --static inline void paging_update_cr3(struct vcpu *v, bool noflush) -+static inline pagetable_t paging_update_cr3(struct vcpu *v, bool noflush) - { -- paging_get_hostmode(v)->update_cr3(v, 1, noflush); -+ return paging_get_hostmode(v)->update_cr3(v, 1, noflush); - } - - /* Update all the things that are derived from the guest's CR0/CR3/CR4. -diff --git a/xen/arch/x86/include/asm/shadow.h b/xen/arch/x86/include/asm/shadow.h -index dad876d294..0b72c9eda8 100644 ---- a/xen/arch/x86/include/asm/shadow.h -+++ b/xen/arch/x86/include/asm/shadow.h -@@ -99,6 +99,9 @@ int shadow_set_allocation(struct domain *d, unsigned int pages, - - int shadow_get_allocation_bytes(struct domain *d, uint64_t *size); - -+/* Helper to invoke for deferred releasing of a top-level shadow's reference. */ -+void shadow_put_top_level(struct domain *d, pagetable_t old); -+ - #else /* !CONFIG_SHADOW_PAGING */ - - #define shadow_vcpu_teardown(v) ASSERT(is_pv_vcpu(v)) -@@ -121,6 +124,11 @@ static inline void shadow_prepare_page_type_change(struct domain *d, - - static inline void shadow_blow_tables_per_domain(struct domain *d) {} - -+static inline void shadow_put_top_level(struct domain *d, pagetable_t old) -+{ -+ ASSERT_UNREACHABLE(); -+} -+ - static inline int shadow_domctl(struct domain *d, - struct xen_domctl_shadow_op *sc, - XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index b46eee1332..e884a6fdbd 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -567,15 +567,12 @@ void write_ptbase(struct vcpu *v) - * - * Update ref counts to shadow tables appropriately. 
- */ --void update_cr3(struct vcpu *v) -+pagetable_t update_cr3(struct vcpu *v) - { - mfn_t cr3_mfn; - - if ( paging_mode_enabled(v->domain) ) -- { -- paging_update_cr3(v, false); -- return; -- } -+ return paging_update_cr3(v, false); - - if ( !(v->arch.flags & TF_kernel_mode) ) - cr3_mfn = pagetable_get_mfn(v->arch.guest_table_user); -@@ -583,6 +580,8 @@ void update_cr3(struct vcpu *v) - cr3_mfn = pagetable_get_mfn(v->arch.guest_table); - - make_cr3(v, cr3_mfn); -+ -+ return pagetable_null(); - } - - static inline void set_tlbflush_timestamp(struct page_info *page) -@@ -3285,6 +3284,7 @@ int new_guest_cr3(mfn_t mfn) - struct domain *d = curr->domain; - int rc; - mfn_t old_base_mfn; -+ pagetable_t old_shadow; - - if ( is_pv_32bit_domain(d) ) - { -@@ -3352,9 +3352,22 @@ int new_guest_cr3(mfn_t mfn) - if ( !VM_ASSIST(d, m2p_strict) ) - fill_ro_mpt(mfn); - curr->arch.guest_table = pagetable_from_mfn(mfn); -- update_cr3(curr); -+ old_shadow = update_cr3(curr); -+ -+ /* -+ * In shadow mode update_cr3() can fail, in which case here we're still -+ * running on the prior top-level shadow (which we're about to release). -+ * Switch to the idle page tables in such an event; the guest will have -+ * been crashed already. 
-+ */ -+ if ( likely(!mfn_eq(pagetable_get_mfn(old_shadow), -+ maddr_to_mfn(curr->arch.cr3 & ~X86_CR3_NOFLUSH))) ) -+ write_ptbase(curr); -+ else -+ write_ptbase(idle_vcpu[curr->processor]); - -- write_ptbase(curr); -+ if ( !pagetable_is_null(old_shadow) ) -+ shadow_put_top_level(d, old_shadow); - - if ( likely(mfn_x(old_base_mfn) != 0) ) - { -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index 0fc1b1d9ac..57a19c3d59 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -739,11 +739,13 @@ static bool cf_check hap_invlpg(struct vcpu *v, unsigned long linear) - return 1; - } - --static void cf_check hap_update_cr3( -- struct vcpu *v, int do_locking, bool noflush) -+static pagetable_t cf_check hap_update_cr3( -+ struct vcpu *v, bool do_locking, bool noflush) - { - v->arch.hvm.hw_cr[3] = v->arch.hvm.guest_cr[3]; - hvm_update_guest_cr3(v, noflush); -+ -+ return pagetable_null(); - } - - static bool flush_vcpu(const struct vcpu *v, const unsigned long *vcpu_bitmap) -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index cf5e181f74..c0940f939e 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2590,13 +2590,13 @@ void cf_check shadow_update_paging_modes(struct vcpu *v) - } - - /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */ --void sh_set_toplevel_shadow(struct vcpu *v, -- unsigned int slot, -- mfn_t gmfn, -- unsigned int root_type, -- mfn_t (*make_shadow)(struct vcpu *v, -- mfn_t gmfn, -- uint32_t shadow_type)) -+pagetable_t sh_set_toplevel_shadow(struct vcpu *v, -+ unsigned int slot, -+ mfn_t gmfn, -+ unsigned int root_type, -+ mfn_t (*make_shadow)(struct vcpu *v, -+ mfn_t gmfn, -+ uint32_t shadow_type)) - { - mfn_t smfn; - pagetable_t old_entry, new_entry; -@@ -2653,20 +2653,37 @@ void sh_set_toplevel_shadow(struct vcpu *v, - mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry))); - v->arch.paging.shadow.shadow_table[slot] = 
new_entry; - -- /* Decrement the refcount of the old contents of this slot */ -- if ( !pagetable_is_null(old_entry) ) -+ /* -+ * Decrement the refcount of the old contents of this slot, unless -+ * we're still running on that shadow - in that case it'll need holding -+ * on to until the actual page table switch did occur. -+ */ -+ if ( !pagetable_is_null(old_entry) && (v != current || !is_pv_domain(d)) ) - { -- mfn_t old_smfn = pagetable_get_mfn(old_entry); -- /* Need to repin the old toplevel shadow if it's been unpinned -- * by shadow_prealloc(): in PV mode we're still running on this -- * shadow and it's not safe to free it yet. */ -- if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(d, old_smfn) ) -- { -- printk(XENLOG_G_ERR "can't re-pin %"PRI_mfn"\n", mfn_x(old_smfn)); -- domain_crash(d); -- } -- sh_put_ref(d, old_smfn, 0); -+ sh_put_ref(d, pagetable_get_mfn(old_entry), 0); -+ old_entry = pagetable_null(); - } -+ -+ /* -+ * 2- and 3-level shadow mode is used for HVM only. Therefore we never run -+ * on such a shadow, so only call sites requesting an L4 shadow need to pay -+ * attention to the returned value. -+ */ -+ ASSERT(pagetable_is_null(old_entry) || root_type == SH_type_l4_64_shadow); -+ -+ return old_entry; -+} -+ -+/* -+ * Helper invoked when releasing of a top-level shadow's reference was -+ * deferred in sh_set_toplevel_shadow() above. 
-+ */ -+void shadow_put_top_level(struct domain *d, pagetable_t old_entry) -+{ -+ ASSERT(!pagetable_is_null(old_entry)); -+ paging_lock(d); -+ sh_put_ref(d, pagetable_get_mfn(old_entry), 0); -+ paging_unlock(d); - } - - /**************************************************************************/ -diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c -index 671bf8c228..c92b354a78 100644 ---- a/xen/arch/x86/mm/shadow/multi.c -+++ b/xen/arch/x86/mm/shadow/multi.c -@@ -3224,7 +3224,8 @@ static void cf_check sh_detach_old_tables(struct vcpu *v) - } - } - --static void cf_check sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) -+static pagetable_t cf_check sh_update_cr3(struct vcpu *v, bool do_locking, -+ bool noflush) - /* Updates vcpu->arch.cr3 after the guest has changed CR3. - * Paravirtual guests should set v->arch.guest_table (and guest_table_user, - * if appropriate). -@@ -3238,6 +3239,7 @@ static void cf_check sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) - { - struct domain *d = v->domain; - mfn_t gmfn; -+ pagetable_t old_entry = pagetable_null(); - #if GUEST_PAGING_LEVELS == 3 - const guest_l3e_t *gl3e; - unsigned int i, guest_idx; -@@ -3247,7 +3249,7 @@ static void cf_check sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) - if ( !is_hvm_domain(d) && !v->is_initialised ) - { - ASSERT(v->arch.cr3 == 0); -- return; -+ return old_entry; - } - - if ( do_locking ) paging_lock(v->domain); -@@ -3320,11 +3322,12 @@ static void cf_check sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) - #if GUEST_PAGING_LEVELS == 4 - if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); -- sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, -+ sh_make_shadow); - if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) - { - ASSERT(d->is_dying || d->is_shutting_down); -- 
return; -+ return old_entry; - } - if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) ) - { -@@ -3368,24 +3371,30 @@ static void cf_check sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) - gl2gfn = guest_l3e_get_gfn(gl3e[i]); - gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt); - if ( p2m_is_ram(p2mt) ) -- sh_set_toplevel_shadow(v, i, gl2mfn, SH_type_l2_shadow, -- sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, i, gl2mfn, -+ SH_type_l2_shadow, -+ sh_make_shadow); - else -- sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, -- sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, -+ sh_make_shadow); - } - else -- sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, i, INVALID_MFN, 0, -+ sh_make_shadow); -+ -+ ASSERT(pagetable_is_null(old_entry)); - } - } - #elif GUEST_PAGING_LEVELS == 2 - if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); -- sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow); -+ old_entry = sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, -+ sh_make_shadow); -+ ASSERT(pagetable_is_null(old_entry)); - if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) - { - ASSERT(d->is_dying || d->is_shutting_down); -- return; -+ return old_entry; - } - #else - #error This should never happen -@@ -3473,6 +3482,8 @@ static void cf_check sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) - - /* Release the lock, if we took it (otherwise it's the caller's problem) */ - if ( do_locking ) paging_unlock(v->domain); -+ -+ return old_entry; - } - - -diff --git a/xen/arch/x86/mm/shadow/none.c b/xen/arch/x86/mm/shadow/none.c -index eaaa874b11..743c0ffb85 100644 ---- a/xen/arch/x86/mm/shadow/none.c -+++ b/xen/arch/x86/mm/shadow/none.c -@@ -52,9 +52,11 @@ static unsigned long cf_check _gva_to_gfn( - } - #endif - --static void cf_check _update_cr3(struct vcpu *v, int do_locking, 
bool noflush) -+static pagetable_t cf_check _update_cr3(struct vcpu *v, bool do_locking, -+ bool noflush) - { - ASSERT_UNREACHABLE(); -+ return pagetable_null(); - } - - static void cf_check _update_paging_modes(struct vcpu *v) -diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h -index c2bb1ed3c3..91f798c5aa 100644 ---- a/xen/arch/x86/mm/shadow/private.h -+++ b/xen/arch/x86/mm/shadow/private.h -@@ -391,13 +391,13 @@ mfn_t shadow_alloc(struct domain *d, - void shadow_free(struct domain *d, mfn_t smfn); - - /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */ --void sh_set_toplevel_shadow(struct vcpu *v, -- unsigned int slot, -- mfn_t gmfn, -- unsigned int root_type, -- mfn_t (*make_shadow)(struct vcpu *v, -- mfn_t gmfn, -- uint32_t shadow_type)); -+pagetable_t sh_set_toplevel_shadow(struct vcpu *v, -+ unsigned int slot, -+ mfn_t gmfn, -+ unsigned int root_type, -+ mfn_t (*make_shadow)(struct vcpu *v, -+ mfn_t gmfn, -+ uint32_t shadow_type)); - - /* Update the shadows in response to a pagetable write from Xen */ - int sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size); -diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c -index 5c92812dc6..2a445bb17b 100644 ---- a/xen/arch/x86/pv/domain.c -+++ b/xen/arch/x86/pv/domain.c -@@ -424,10 +424,13 @@ bool __init xpti_pcid_enabled(void) - - static void _toggle_guest_pt(struct vcpu *v) - { -+ bool guest_update; -+ pagetable_t old_shadow; - unsigned long cr3; - - v->arch.flags ^= TF_kernel_mode; -- update_cr3(v); -+ guest_update = v->arch.flags & TF_kernel_mode; -+ old_shadow = update_cr3(v); - - /* - * Don't flush user global mappings from the TLB. Don't tick TLB clock. -@@ -436,13 +439,31 @@ static void _toggle_guest_pt(struct vcpu *v) - * TLB flush (for just the incoming PCID), as the top level page table may - * have changed behind our backs. To be on the safe side, suppress the - * no-flush unconditionally in this case. 
-+ * -+ * Furthermore in shadow mode update_cr3() can fail, in which case here -+ * we're still running on the prior top-level shadow (which we're about -+ * to release). Switch to the idle page tables in such an event; the -+ * guest will have been crashed already. - */ - cr3 = v->arch.cr3; - if ( shadow_mode_enabled(v->domain) ) -+ { - cr3 &= ~X86_CR3_NOFLUSH; -+ -+ if ( unlikely(mfn_eq(pagetable_get_mfn(old_shadow), -+ maddr_to_mfn(cr3))) ) -+ { -+ cr3 = idle_vcpu[v->processor]->arch.cr3; -+ /* Also suppress runstate/time area updates below. */ -+ guest_update = false; -+ } -+ } - write_cr3(cr3); - -- if ( !(v->arch.flags & TF_kernel_mode) ) -+ if ( !pagetable_is_null(old_shadow) ) -+ shadow_put_top_level(v->domain, old_shadow); -+ -+ if ( !guest_update ) - return; - - if ( v->arch.pv.need_update_runstate_area && update_runstate_area(v) ) --- -2.42.0 - diff --git a/0041-tools-xenstored-domain_entry_fix-Handle-conflicting-.patch b/0041-tools-xenstored-domain_entry_fix-Handle-conflicting-.patch deleted file mode 100644 index 1edecc8..0000000 --- a/0041-tools-xenstored-domain_entry_fix-Handle-conflicting-.patch +++ /dev/null @@ -1,64 +0,0 @@ -From c4e05c97f57d236040d1da5c1fbf6e3699dc86ea Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Fri, 22 Sep 2023 11:32:16 +0100 -Subject: [PATCH 41/55] tools/xenstored: domain_entry_fix(): Handle conflicting - transaction - -The function domain_entry_fix() will be initially called to check if the -quota is correct before attempt to commit any nodes. So it would be -possible that accounting is temporarily negative. This is the case -in the following sequence: - - 1) Create 50 nodes - 2) Start two transactions - 3) Delete all the nodes in each transaction - 4) Commit the two transactions - -Because the first transaction will have succeed and updated the -accounting, there is no guarantee that 'd->nbentry + num' will still -be above 0. So the assert() would be triggered. 
-The assert() was introduced in dbef1f748289 ("tools/xenstore: simplify -and fix per domain node accounting") with the assumption that the -value can't be negative. As this is not true revert to the original -check but restricted to the path where we don't update. Take the -opportunity to explain the rationale behind the check. - -This CVE-2023-34323 / XSA-440. - -Fixes: dbef1f748289 ("tools/xenstore: simplify and fix per domain node accounting") -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Juergen Gross <jgross@suse.com> ---- - tools/xenstore/xenstored_domain.c | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index aa86892fed..6074df210c 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -1094,10 +1094,20 @@ int domain_entry_fix(unsigned int domid, int num, bool update) - } - - cnt = d->nbentry + num; -- assert(cnt >= 0); - -- if (update) -+ if (update) { -+ assert(cnt >= 0); - d->nbentry = cnt; -+ } else if (cnt < 0) { -+ /* -+ * In a transaction when a node is being added/removed AND -+ * the same node has been added/removed outside the -+ * transaction in parallel, the result value may be negative. -+ * This is no problem, as the transaction will fail due to -+ * the resulting conflict. So override 'cnt'. -+ */ -+ cnt = 0; -+ } - - return domid_is_unprivileged(domid) ? 
cnt : 0; - } --- -2.42.0 - diff --git a/0042-iommu-amd-vi-flush-IOMMU-TLB-when-flushing-the-DTE.patch b/0042-iommu-amd-vi-flush-IOMMU-TLB-when-flushing-the-DTE.patch deleted file mode 100644 index 66597c2..0000000 --- a/0042-iommu-amd-vi-flush-IOMMU-TLB-when-flushing-the-DTE.patch +++ /dev/null @@ -1,186 +0,0 @@ -From 0d8f9f7f2706e8ad8dfff203173693b631339b86 Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne <roger.pau@citrix.com> -Date: Tue, 13 Jun 2023 15:01:05 +0200 -Subject: [PATCH 42/55] iommu/amd-vi: flush IOMMU TLB when flushing the DTE -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The caching invalidation guidelines from the AMD-Vi specification (48882—Rev -3.07-PUB—Oct 2022) seem to be misleading on some hardware, as devices will -malfunction (see stale DMA mappings) if some fields of the DTE are updated but -the IOMMU TLB is not flushed. This has been observed in practice on AMD -systems. Due to the lack of guidance from the currently published -specification this patch aims to increase the flushing done in order to prevent -device malfunction. - -In order to fix, issue an INVALIDATE_IOMMU_PAGES command from -amd_iommu_flush_device(), flushing all the address space. Note this requires -callers to be adjusted in order to pass the DomID on the DTE previous to the -modification. - -Some call sites don't provide a valid DomID to amd_iommu_flush_device() in -order to avoid the flush. That's because the device had address translations -disabled and hence the previous DomID on the DTE is not valid. Note the -current logic relies on the entity disabling address translations to also flush -the TLB of the in use DomID. - -Device I/O TLB flushing when ATS are enabled is not covered by the current -change, as ATS usage is not security supported. 
- -This is XSA-442 / CVE-2023-34326 - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 5fc98b97084a46884acef9320e643faf40d42212) ---- - xen/drivers/passthrough/amd/iommu.h | 3 ++- - xen/drivers/passthrough/amd/iommu_cmd.c | 10 +++++++++- - xen/drivers/passthrough/amd/iommu_guest.c | 5 +++-- - xen/drivers/passthrough/amd/iommu_init.c | 6 +++++- - xen/drivers/passthrough/amd/pci_amd_iommu.c | 14 ++++++++++---- - 5 files changed, 29 insertions(+), 9 deletions(-) - -diff --git a/xen/drivers/passthrough/amd/iommu.h b/xen/drivers/passthrough/amd/iommu.h -index 5429ada58e..a58be28bf9 100644 ---- a/xen/drivers/passthrough/amd/iommu.h -+++ b/xen/drivers/passthrough/amd/iommu.h -@@ -283,7 +283,8 @@ void amd_iommu_flush_pages(struct domain *d, unsigned long dfn, - unsigned int order); - void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev, - uint64_t gaddr, unsigned int order); --void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf); -+void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf, -+ domid_t domid); - void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf); - void amd_iommu_flush_all_caches(struct amd_iommu *iommu); - -diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c b/xen/drivers/passthrough/amd/iommu_cmd.c -index 40ddf366bb..cb28b36abc 100644 ---- a/xen/drivers/passthrough/amd/iommu_cmd.c -+++ b/xen/drivers/passthrough/amd/iommu_cmd.c -@@ -363,10 +363,18 @@ void amd_iommu_flush_pages(struct domain *d, - _amd_iommu_flush_pages(d, __dfn_to_daddr(dfn), order); - } - --void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf) -+void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf, -+ domid_t domid) - { - invalidate_dev_table_entry(iommu, bdf); - flush_command_buffer(iommu, 0); -+ -+ /* Also invalidate IOMMU TLB entries when flushing the DTE. 
*/ -+ if ( domid != DOMID_INVALID ) -+ { -+ invalidate_iommu_pages(iommu, INV_IOMMU_ALL_PAGES_ADDRESS, domid, 0); -+ flush_command_buffer(iommu, 0); -+ } - } - - void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf) -diff --git a/xen/drivers/passthrough/amd/iommu_guest.c b/xen/drivers/passthrough/amd/iommu_guest.c -index 80a331f546..be86bce6fb 100644 ---- a/xen/drivers/passthrough/amd/iommu_guest.c -+++ b/xen/drivers/passthrough/amd/iommu_guest.c -@@ -385,7 +385,7 @@ static int do_completion_wait(struct domain *d, cmd_entry_t *cmd) - - static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd) - { -- uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id; -+ uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id, prev_domid; - struct amd_iommu_dte *gdte, *mdte, *dte_base; - struct amd_iommu *iommu = NULL; - struct guest_iommu *g_iommu; -@@ -445,13 +445,14 @@ static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd) - req_id = get_dma_requestor_id(iommu->seg, mbdf); - dte_base = iommu->dev_table.buffer; - mdte = &dte_base[req_id]; -+ prev_domid = mdte->domain_id; - - spin_lock_irqsave(&iommu->lock, flags); - dte_set_gcr3_table(mdte, hdom_id, gcr3_mfn << PAGE_SHIFT, gv, glx); - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, req_id); -+ amd_iommu_flush_device(iommu, req_id, prev_domid); - - return 0; - } -diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c -index 166570648d..101a60ce17 100644 ---- a/xen/drivers/passthrough/amd/iommu_init.c -+++ b/xen/drivers/passthrough/amd/iommu_init.c -@@ -1547,7 +1547,11 @@ static int cf_check _invalidate_all_devices( - req_id = ivrs_mappings[bdf].dte_requestor_id; - if ( iommu ) - { -- amd_iommu_flush_device(iommu, req_id); -+ /* -+ * IOMMU TLB flush performed separately (see -+ * invalidate_all_domain_pages()). 
-+ */ -+ amd_iommu_flush_device(iommu, req_id, DOMID_INVALID); - amd_iommu_flush_intremap(iommu, req_id); - } - } -diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c -index 94e3775506..8641b84712 100644 ---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c -+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c -@@ -192,10 +192,13 @@ static int __must_check amd_iommu_setup_domain_device( - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, req_id); -+ /* DTE didn't have DMA translations enabled, do not flush the TLB. */ -+ amd_iommu_flush_device(iommu, req_id, DOMID_INVALID); - } - else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) ) - { -+ domid_t prev_domid = dte->domain_id; -+ - /* - * Strictly speaking if the device is the only one with this requestor - * ID, it could be allowed to be re-assigned regardless of unity map -@@ -252,7 +255,7 @@ static int __must_check amd_iommu_setup_domain_device( - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, req_id); -+ amd_iommu_flush_device(iommu, req_id, prev_domid); - } - else - spin_unlock_irqrestore(&iommu->lock, flags); -@@ -421,6 +424,8 @@ static void amd_iommu_disable_domain_device(const struct domain *domain, - spin_lock_irqsave(&iommu->lock, flags); - if ( dte->tv || dte->v ) - { -+ domid_t prev_domid = dte->domain_id; -+ - /* See the comment in amd_iommu_setup_device_table(). 
*/ - dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_ABORTED; - smp_wmb(); -@@ -439,7 +444,7 @@ static void amd_iommu_disable_domain_device(const struct domain *domain, - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, req_id); -+ amd_iommu_flush_device(iommu, req_id, prev_domid); - - AMD_IOMMU_DEBUG("Disable: device id = %#x, " - "domain = %d, paging mode = %d\n", -@@ -610,7 +615,8 @@ static int cf_check amd_iommu_add_device(u8 devfn, struct pci_dev *pdev) - - spin_unlock_irqrestore(&iommu->lock, flags); - -- amd_iommu_flush_device(iommu, bdf); -+ /* DTE didn't have DMA translations enabled, do not flush the TLB. */ -+ amd_iommu_flush_device(iommu, bdf, DOMID_INVALID); - } - - if ( amd_iommu_reserve_domain_unity_map( --- -2.42.0 - diff --git a/0043-libfsimage-xfs-Remove-dead-code.patch b/0043-libfsimage-xfs-Remove-dead-code.patch deleted file mode 100644 index cbb9ad4..0000000 --- a/0043-libfsimage-xfs-Remove-dead-code.patch +++ /dev/null @@ -1,71 +0,0 @@ -From d665c6690eb3c2c86cb2c7dac09804211481f926 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Thu, 14 Sep 2023 13:22:50 +0100 -Subject: [PATCH 43/55] libfsimage/xfs: Remove dead code - -xfs_info.agnolog (and related code) and XFS_INO_AGBNO_BITS are dead code -that serve no purpose. 
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 37fc1e6c1c5c63aafd9cfd76a37728d5baea7d71) ---- - tools/libfsimage/xfs/fsys_xfs.c | 18 ------------------ - 1 file changed, 18 deletions(-) - -diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c -index d735a88e55..2800699f59 100644 ---- a/tools/libfsimage/xfs/fsys_xfs.c -+++ b/tools/libfsimage/xfs/fsys_xfs.c -@@ -37,7 +37,6 @@ struct xfs_info { - int blklog; - int inopblog; - int agblklog; -- int agnolog; - unsigned int nextents; - xfs_daddr_t next; - xfs_daddr_t daddr; -@@ -65,9 +64,7 @@ static struct xfs_info xfs; - - #define XFS_INO_MASK(k) ((xfs_uint32_t)((1ULL << (k)) - 1)) - #define XFS_INO_OFFSET_BITS xfs.inopblog --#define XFS_INO_AGBNO_BITS xfs.agblklog - #define XFS_INO_AGINO_BITS (xfs.agblklog + xfs.inopblog) --#define XFS_INO_AGNO_BITS xfs.agnolog - - static inline xfs_agblock_t - agino2agbno (xfs_agino_t agino) -@@ -149,20 +146,6 @@ xt_len (xfs_bmbt_rec_32_t *r) - return le32(r->l3) & mask32lo(21); - } - --static inline int --xfs_highbit32(xfs_uint32_t v) --{ -- int i; -- -- if (--v) { -- for (i = 0; i < 31; i++, v >>= 1) { -- if (v == 0) -- return i; -- } -- } -- return 0; --} -- - static int - isinxt (xfs_fileoff_t key, xfs_fileoff_t offset, xfs_filblks_t len) - { -@@ -472,7 +455,6 @@ xfs_mount (fsi_file_t *ffi, const char *options) - - xfs.inopblog = super.sb_inopblog; - xfs.agblklog = super.sb_agblklog; -- xfs.agnolog = xfs_highbit32 (le32(super.sb_agcount)); - - xfs.btnode_ptr0_off = - ((xfs.bsize - sizeof(xfs_btree_block_t)) / --- -2.42.0 - diff --git a/0044-libfsimage-xfs-Amend-mask32lo-to-allow-the-value-32.patch b/0044-libfsimage-xfs-Amend-mask32lo-to-allow-the-value-32.patch deleted file mode 100644 index 880ff83..0000000 --- a/0044-libfsimage-xfs-Amend-mask32lo-to-allow-the-value-32.patch +++ /dev/null @@ -1,33 +0,0 @@ -From 
f1cd620cc3572c858e276463e05f695d949362c5 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Thu, 14 Sep 2023 13:22:51 +0100 -Subject: [PATCH 44/55] libfsimage/xfs: Amend mask32lo() to allow the value 32 - -agblklog could plausibly be 32, but that would overflow this shift. -Perform the shift as ULL and cast to u32 at the end instead. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit ddc45e4eea946bb373a4b4a60c84bf9339cf413b) ---- - tools/libfsimage/xfs/fsys_xfs.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c -index 2800699f59..4720bb4505 100644 ---- a/tools/libfsimage/xfs/fsys_xfs.c -+++ b/tools/libfsimage/xfs/fsys_xfs.c -@@ -60,7 +60,7 @@ static struct xfs_info xfs; - #define inode ((xfs_dinode_t *)((char *)FSYS_BUF + 8192)) - #define icore (inode->di_core) - --#define mask32lo(n) (((xfs_uint32_t)1 << (n)) - 1) -+#define mask32lo(n) ((xfs_uint32_t)((1ull << (n)) - 1)) - - #define XFS_INO_MASK(k) ((xfs_uint32_t)((1ULL << (k)) - 1)) - #define XFS_INO_OFFSET_BITS xfs.inopblog --- -2.42.0 - diff --git a/0045-libfsimage-xfs-Sanity-check-the-superblock-during-mo.patch b/0045-libfsimage-xfs-Sanity-check-the-superblock-during-mo.patch deleted file mode 100644 index 01ae52a..0000000 --- a/0045-libfsimage-xfs-Sanity-check-the-superblock-during-mo.patch +++ /dev/null @@ -1,137 +0,0 @@ -From 78143c5336c8316bcc648e964d65a07f216cf77f Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Thu, 14 Sep 2023 13:22:52 +0100 -Subject: [PATCH 45/55] libfsimage/xfs: Sanity-check the superblock during - mounts - -Sanity-check the XFS superblock for wellformedness at the mount handler. 
-This forces pygrub to abort parsing a potentially malformed filesystem and -ensures the invariants assumed throughout the rest of the code hold. - -Also, derive parameters from previously sanitized parameters where possible -(rather than reading them off the superblock) - -The code doesn't try to avoid overflowing the end of the disk, because -that's an unlikely and benign error. Parameters used in calculations of -xfs_daddr_t (like the root inode index) aren't in critical need of being -sanitized. - -The sanitization of agblklog is basically checking that no obvious -overflows happen on agblklog, and then ensuring agblocks is contained in -the range (2^(sb_agblklog-1), 2^sb_agblklog]. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 620500dd1baf33347dfde5e7fde7cf7fe347da5c) ---- - tools/libfsimage/xfs/fsys_xfs.c | 48 ++++++++++++++++++++++++++------- - tools/libfsimage/xfs/xfs.h | 12 +++++++++ - 2 files changed, 50 insertions(+), 10 deletions(-) - -diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c -index 4720bb4505..e4eb7e1ee2 100644 ---- a/tools/libfsimage/xfs/fsys_xfs.c -+++ b/tools/libfsimage/xfs/fsys_xfs.c -@@ -17,6 +17,7 @@ - * along with this program; If not, see <http://www.gnu.org/licenses/>. 
- */ - -+#include <stdbool.h> - #include <xenfsimage_grub.h> - #include "xfs.h" - -@@ -433,29 +434,56 @@ first_dentry (fsi_file_t *ffi, xfs_ino_t *ino) - return next_dentry (ffi, ino); - } - -+static bool -+xfs_sb_is_invalid (const xfs_sb_t *super) -+{ -+ return (le32(super->sb_magicnum) != XFS_SB_MAGIC) -+ || ((le16(super->sb_versionnum) & XFS_SB_VERSION_NUMBITS) != -+ XFS_SB_VERSION_4) -+ || (super->sb_inodelog < XFS_SB_INODELOG_MIN) -+ || (super->sb_inodelog > XFS_SB_INODELOG_MAX) -+ || (super->sb_blocklog < XFS_SB_BLOCKLOG_MIN) -+ || (super->sb_blocklog > XFS_SB_BLOCKLOG_MAX) -+ || (super->sb_blocklog < super->sb_inodelog) -+ || (super->sb_agblklog > XFS_SB_AGBLKLOG_MAX) -+ || ((1ull << super->sb_agblklog) < le32(super->sb_agblocks)) -+ || (((1ull << super->sb_agblklog) >> 1) >= -+ le32(super->sb_agblocks)) -+ || ((super->sb_blocklog + super->sb_dirblklog) >= -+ XFS_SB_DIRBLK_NUMBITS); -+} -+ - static int - xfs_mount (fsi_file_t *ffi, const char *options) - { - xfs_sb_t super; - - if (!devread (ffi, 0, 0, sizeof(super), (char *)&super) -- || (le32(super.sb_magicnum) != XFS_SB_MAGIC) -- || ((le16(super.sb_versionnum) -- & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4) ) { -+ || xfs_sb_is_invalid(&super)) { - return 0; - } - -- xfs.bsize = le32 (super.sb_blocksize); -- xfs.blklog = super.sb_blocklog; -- xfs.bdlog = xfs.blklog - SECTOR_BITS; -+ /* -+ * Not sanitized. It's exclusively used to generate disk addresses, -+ * so it's not important from a security standpoint. 
-+ */ - xfs.rootino = le64 (super.sb_rootino); -- xfs.isize = le16 (super.sb_inodesize); -- xfs.agblocks = le32 (super.sb_agblocks); -- xfs.dirbsize = xfs.bsize << super.sb_dirblklog; - -- xfs.inopblog = super.sb_inopblog; -+ /* -+ * Sanitized to be consistent with each other, only used to -+ * generate disk addresses, so it's safe -+ */ -+ xfs.agblocks = le32 (super.sb_agblocks); - xfs.agblklog = super.sb_agblklog; - -+ /* Derived from sanitized parameters */ -+ xfs.bsize = 1 << super.sb_blocklog; -+ xfs.blklog = super.sb_blocklog; -+ xfs.bdlog = super.sb_blocklog - SECTOR_BITS; -+ xfs.isize = 1 << super.sb_inodelog; -+ xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog); -+ xfs.inopblog = super.sb_blocklog - super.sb_inodelog; -+ - xfs.btnode_ptr0_off = - ((xfs.bsize - sizeof(xfs_btree_block_t)) / - (sizeof (xfs_bmbt_key_t) + sizeof (xfs_bmbt_ptr_t))) -diff --git a/tools/libfsimage/xfs/xfs.h b/tools/libfsimage/xfs/xfs.h -index 40699281e4..b87e37d3d7 100644 ---- a/tools/libfsimage/xfs/xfs.h -+++ b/tools/libfsimage/xfs/xfs.h -@@ -134,6 +134,18 @@ typedef struct xfs_sb - xfs_uint8_t sb_dummy[7]; /* padding */ - } xfs_sb_t; - -+/* Bound taken from xfs.c in GRUB2. It doesn't exist in the spec */ -+#define XFS_SB_DIRBLK_NUMBITS 27 -+/* Implied by the XFS specification. The minimum block size is 512 octets */ -+#define XFS_SB_BLOCKLOG_MIN 9 -+/* Implied by the XFS specification. The maximum block size is 65536 octets */ -+#define XFS_SB_BLOCKLOG_MAX 16 -+/* Implied by the XFS specification. The minimum inode size is 256 octets */ -+#define XFS_SB_INODELOG_MIN 8 -+/* Implied by the XFS specification. 
The maximum inode size is 2048 octets */ -+#define XFS_SB_INODELOG_MAX 11 -+/* High bound for sb_agblklog */ -+#define XFS_SB_AGBLKLOG_MAX 32 - - /* those are from xfs_btree.h */ - --- -2.42.0 - diff --git a/0046-libfsimage-xfs-Add-compile-time-check-to-libfsimage.patch b/0046-libfsimage-xfs-Add-compile-time-check-to-libfsimage.patch deleted file mode 100644 index 0c32745..0000000 --- a/0046-libfsimage-xfs-Add-compile-time-check-to-libfsimage.patch +++ /dev/null @@ -1,62 +0,0 @@ -From eb4efdac4cc7121f832ee156f39761312878f3a5 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Thu, 14 Sep 2023 13:22:53 +0100 -Subject: [PATCH 46/55] libfsimage/xfs: Add compile-time check to libfsimage - -Adds the common tools include folder to the -I compile flags -of libfsimage. This allows us to use: - xen-tools/common-macros.h:BUILD_BUG_ON() - -With it, statically assert a sanitized "blocklog - SECTOR_BITS" cannot -underflow. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 7d85c70431593550e32022e3a19a37f306f49e00) ---- - tools/libfsimage/common.mk | 2 +- - tools/libfsimage/xfs/fsys_xfs.c | 4 +++- - 2 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/tools/libfsimage/common.mk b/tools/libfsimage/common.mk -index 4fc8c66795..e4336837d0 100644 ---- a/tools/libfsimage/common.mk -+++ b/tools/libfsimage/common.mk -@@ -1,7 +1,7 @@ - include $(XEN_ROOT)/tools/Rules.mk - - FSDIR := $(libdir)/xenfsimage --CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ -DFSIMAGE_FSDIR=\"$(FSDIR)\" -+CFLAGS += -Wno-unknown-pragmas -I$(XEN_ROOT)/tools/libfsimage/common/ $(CFLAGS_xeninclude) -DFSIMAGE_FSDIR=\"$(FSDIR)\" - CFLAGS += -D_GNU_SOURCE - LDFLAGS += -L../common/ - -diff --git a/tools/libfsimage/xfs/fsys_xfs.c b/tools/libfsimage/xfs/fsys_xfs.c -index e4eb7e1ee2..4a8dd6f239 100644 ---- 
a/tools/libfsimage/xfs/fsys_xfs.c -+++ b/tools/libfsimage/xfs/fsys_xfs.c -@@ -19,6 +19,7 @@ - - #include <stdbool.h> - #include <xenfsimage_grub.h> -+#include <xen-tools/libs.h> - #include "xfs.h" - - #define MAX_LINK_COUNT 8 -@@ -477,9 +478,10 @@ xfs_mount (fsi_file_t *ffi, const char *options) - xfs.agblklog = super.sb_agblklog; - - /* Derived from sanitized parameters */ -+ BUILD_BUG_ON(XFS_SB_BLOCKLOG_MIN < SECTOR_BITS); -+ xfs.bdlog = super.sb_blocklog - SECTOR_BITS; - xfs.bsize = 1 << super.sb_blocklog; - xfs.blklog = super.sb_blocklog; -- xfs.bdlog = super.sb_blocklog - SECTOR_BITS; - xfs.isize = 1 << super.sb_inodelog; - xfs.dirbsize = 1 << (super.sb_blocklog + super.sb_dirblklog); - xfs.inopblog = super.sb_blocklog - super.sb_inodelog; --- -2.42.0 - diff --git a/0047-tools-pygrub-Remove-unnecessary-hypercall.patch b/0047-tools-pygrub-Remove-unnecessary-hypercall.patch deleted file mode 100644 index 6bdd9bb..0000000 --- a/0047-tools-pygrub-Remove-unnecessary-hypercall.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 8a584126eae53a44cefb0acdbca201233a557fa5 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Mon, 25 Sep 2023 18:32:21 +0100 -Subject: [PATCH 47/55] tools/pygrub: Remove unnecessary hypercall - -There's a hypercall being issued in order to determine whether PV64 is -supported, but since Xen 4.3 that's strictly true so it's not required. - -Plus, this way we can avoid mapping the privcmd interface altogether in the -depriv pygrub. 
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -(cherry picked from commit f4b504c6170c446e61055cbd388ae4e832a9deca) ---- - tools/pygrub/src/pygrub | 12 +----------- - 1 file changed, 1 insertion(+), 11 deletions(-) - -diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub -index ce7ab0eb8c..ce4e07d3e8 100755 ---- a/tools/pygrub/src/pygrub -+++ b/tools/pygrub/src/pygrub -@@ -18,7 +18,6 @@ import os, sys, string, struct, tempfile, re, traceback, stat, errno - import copy - import logging - import platform --import xen.lowlevel.xc - - import curses, _curses, curses.textpad, curses.ascii - import getopt -@@ -668,14 +667,6 @@ def run_grub(file, entry, fs, cfg_args): - - return grubcfg - --def supports64bitPVguest(): -- xc = xen.lowlevel.xc.xc() -- caps = xc.xeninfo()['xen_caps'].split(" ") -- for cap in caps: -- if cap == "xen-3.0-x86_64": -- return True -- return False -- - # If nothing has been specified, look for a Solaris domU. If found, perform the - # necessary tweaks. 
- def sniff_solaris(fs, cfg): -@@ -684,8 +675,7 @@ def sniff_solaris(fs, cfg): - return cfg - - if not cfg["kernel"]: -- if supports64bitPVguest() and \ -- fs.file_exists("/platform/i86xpv/kernel/amd64/unix"): -+ if fs.file_exists("/platform/i86xpv/kernel/amd64/unix"): - cfg["kernel"] = "/platform/i86xpv/kernel/amd64/unix" - cfg["ramdisk"] = "/platform/i86pc/amd64/boot_archive" - elif fs.file_exists("/platform/i86xpv/kernel/unix"): --- -2.42.0 - diff --git a/0048-tools-pygrub-Small-refactors.patch b/0048-tools-pygrub-Small-refactors.patch deleted file mode 100644 index 55b238c..0000000 --- a/0048-tools-pygrub-Small-refactors.patch +++ /dev/null @@ -1,65 +0,0 @@ -From e7059f16f7c2b99fea30b9671fec74c0375eee8f Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Mon, 25 Sep 2023 18:32:22 +0100 -Subject: [PATCH 48/55] tools/pygrub: Small refactors - -Small tidy up to ensure output_directory always has a trailing '/' to ease -concatenating paths and that `output` can only be a filename or None. 
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -(cherry picked from commit 9f2ff9a7c9b3ac734ae99f17f0134ed0343dcccf) ---- - tools/pygrub/src/pygrub | 10 +++++----- - 1 file changed, 5 insertions(+), 5 deletions(-) - -diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub -index ce4e07d3e8..1042c05b86 100755 ---- a/tools/pygrub/src/pygrub -+++ b/tools/pygrub/src/pygrub -@@ -793,7 +793,7 @@ if __name__ == "__main__": - debug = False - not_really = False - output_format = "sxp" -- output_directory = "/var/run/xen/pygrub" -+ output_directory = "/var/run/xen/pygrub/" - - # what was passed in - incfg = { "kernel": None, "ramdisk": None, "args": "" } -@@ -815,7 +815,8 @@ if __name__ == "__main__": - usage() - sys.exit() - elif o in ("--output",): -- output = a -+ if a != "-": -+ output = a - elif o in ("--kernel",): - incfg["kernel"] = a - elif o in ("--ramdisk",): -@@ -847,12 +848,11 @@ if __name__ == "__main__": - if not os.path.isdir(a): - print("%s is not an existing directory" % a) - sys.exit(1) -- output_directory = a -+ output_directory = a + '/' - - if debug: - logging.basicConfig(level=logging.DEBUG) - -- - try: - os.makedirs(output_directory, 0o700) - except OSError as e: -@@ -861,7 +861,7 @@ if __name__ == "__main__": - else: - raise - -- if output is None or output == "-": -+ if output is None: - fd = sys.stdout.fileno() - else: - fd = os.open(output, os.O_WRONLY) --- -2.42.0 - diff --git a/0049-tools-pygrub-Open-the-output-files-earlier.patch b/0049-tools-pygrub-Open-the-output-files-earlier.patch deleted file mode 100644 index c3b00b1..0000000 --- a/0049-tools-pygrub-Open-the-output-files-earlier.patch +++ /dev/null @@ -1,105 +0,0 @@ -From 37977420670c65db220349510599d3fe47600ad8 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Mon, 25 Sep 2023 18:32:23 +0100 -Subject: [PATCH 49/55] tools/pygrub: Open the output files earlier - -This patch allows 
pygrub to get ahold of every RW file descriptor it needs -early on. A later patch will clamp the filesystem it can access so it can't -obtain any others. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -(cherry picked from commit 0710d7d44586251bfca9758890616dc3d6de8a74) ---- - tools/pygrub/src/pygrub | 37 ++++++++++++++++++++++--------------- - 1 file changed, 22 insertions(+), 15 deletions(-) - -diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub -index 1042c05b86..91e2ec2ab1 100755 ---- a/tools/pygrub/src/pygrub -+++ b/tools/pygrub/src/pygrub -@@ -738,8 +738,7 @@ if __name__ == "__main__": - def usage(): - print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--offset=] <image>" %(sys.argv[0],), file=sys.stderr) - -- def copy_from_image(fs, file_to_read, file_type, output_directory, -- not_really): -+ def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really): - if not_really: - if fs.file_exists(file_to_read): - return "<%s:%s>" % (file_type, file_to_read) -@@ -750,21 +749,18 @@ if __name__ == "__main__": - except Exception as e: - print(e, file=sys.stderr) - sys.exit("Error opening %s in guest" % file_to_read) -- (tfd, ret) = tempfile.mkstemp(prefix="boot_"+file_type+".", -- dir=output_directory) - dataoff = 0 - while True: - data = datafile.read(FS_READ_MAX, dataoff) - if len(data) == 0: -- os.close(tfd) -+ os.close(fd_dst) - del datafile -- return ret -+ return - try: -- os.write(tfd, data) -+ os.write(fd_dst, data) - except Exception as e: - print(e, file=sys.stderr) -- os.close(tfd) -- os.unlink(ret) -+ os.unlink(path_dst) - del datafile - sys.exit("Error writing temporary copy of "+file_type) - dataoff += len(data) -@@ -861,6 +857,14 @@ if __name__ == "__main__": - else: - raise - -+ if not_really: -+ fd_kernel = 
path_kernel = fd_ramdisk = path_ramdisk = None -+ else: -+ (fd_kernel, path_kernel) = tempfile.mkstemp(prefix="boot_kernel.", -+ dir=output_directory) -+ (fd_ramdisk, path_ramdisk) = tempfile.mkstemp(prefix="boot_ramdisk.", -+ dir=output_directory) -+ - if output is None: - fd = sys.stdout.fileno() - else: -@@ -920,20 +924,23 @@ if __name__ == "__main__": - if fs is None: - raise RuntimeError("Unable to find partition containing kernel") - -- bootcfg["kernel"] = copy_from_image(fs, chosencfg["kernel"], "kernel", -- output_directory, not_really) -+ copy_from_image(fs, chosencfg["kernel"], "kernel", -+ fd_kernel, path_kernel, not_really) -+ bootcfg["kernel"] = path_kernel - - if chosencfg["ramdisk"]: - try: -- bootcfg["ramdisk"] = copy_from_image(fs, chosencfg["ramdisk"], -- "ramdisk", output_directory, -- not_really) -+ copy_from_image(fs, chosencfg["ramdisk"], "ramdisk", -+ fd_ramdisk, path_ramdisk, not_really) - except: - if not not_really: -- os.unlink(bootcfg["kernel"]) -+ os.unlink(path_kernel) - raise -+ bootcfg["ramdisk"] = path_ramdisk - else: - initrd = None -+ if not not_really: -+ os.unlink(path_ramdisk) - - args = None - if chosencfg["args"]: --- -2.42.0 - diff --git a/0050-tools-libfsimage-Export-a-new-function-to-preload-al.patch b/0050-tools-libfsimage-Export-a-new-function-to-preload-al.patch deleted file mode 100644 index 949528d..0000000 --- a/0050-tools-libfsimage-Export-a-new-function-to-preload-al.patch +++ /dev/null @@ -1,126 +0,0 @@ -From 8ee19246ad2c1d0ce241a52683f56b144a4f0b0e Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Mon, 25 Sep 2023 18:32:24 +0100 -Subject: [PATCH 50/55] tools/libfsimage: Export a new function to preload all - plugins - -This is work required in order to let pygrub operate in highly deprivileged -chroot mode. This patch adds a function that preloads every plugin, hence -ensuring that a on function exit, every shared library is loaded in memory. 
- -The new "init" function is supposed to be used before depriv, but that's -fine because it's not acting on untrusted data. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -(cherry picked from commit 990e65c3ad9ac08642ce62a92852c80be6c83e96) ---- - tools/libfsimage/common/fsimage_plugin.c | 4 ++-- - tools/libfsimage/common/mapfile-GNU | 1 + - tools/libfsimage/common/mapfile-SunOS | 1 + - tools/libfsimage/common/xenfsimage.h | 8 ++++++++ - tools/pygrub/src/fsimage/fsimage.c | 15 +++++++++++++++ - 5 files changed, 27 insertions(+), 2 deletions(-) - -diff --git a/tools/libfsimage/common/fsimage_plugin.c b/tools/libfsimage/common/fsimage_plugin.c -index de1412b423..d0cb9e96a6 100644 ---- a/tools/libfsimage/common/fsimage_plugin.c -+++ b/tools/libfsimage/common/fsimage_plugin.c -@@ -119,7 +119,7 @@ fail: - return (-1); - } - --static int load_plugins(void) -+int fsi_init(void) - { - const char *fsdir = getenv("XEN_FSIMAGE_FSDIR"); - struct dirent *dp = NULL; -@@ -180,7 +180,7 @@ int find_plugin(fsi_t *fsi, const char *path, const char *options) - fsi_plugin_t *fp; - int ret = 0; - -- if (plugins == NULL && (ret = load_plugins()) != 0) -+ if (plugins == NULL && (ret = fsi_init()) != 0) - goto out; - - for (fp = plugins; fp != NULL; fp = fp->fp_next) { -diff --git a/tools/libfsimage/common/mapfile-GNU b/tools/libfsimage/common/mapfile-GNU -index 26d4d7a69e..2d54d527d7 100644 ---- a/tools/libfsimage/common/mapfile-GNU -+++ b/tools/libfsimage/common/mapfile-GNU -@@ -1,6 +1,7 @@ - VERSION { - libfsimage.so.1.0 { - global: -+ fsi_init; - fsi_open_fsimage; - fsi_close_fsimage; - fsi_file_exists; -diff --git a/tools/libfsimage/common/mapfile-SunOS b/tools/libfsimage/common/mapfile-SunOS -index e99b90b650..48deedb425 100644 ---- a/tools/libfsimage/common/mapfile-SunOS -+++ b/tools/libfsimage/common/mapfile-SunOS -@@ -1,5 +1,6 @@ - libfsimage.so.1.0 { - global: -+ fsi_init; - fsi_open_fsimage; - fsi_close_fsimage; - 
fsi_file_exists; -diff --git a/tools/libfsimage/common/xenfsimage.h b/tools/libfsimage/common/xenfsimage.h -index 201abd54f2..341883b2d7 100644 ---- a/tools/libfsimage/common/xenfsimage.h -+++ b/tools/libfsimage/common/xenfsimage.h -@@ -35,6 +35,14 @@ extern C { - typedef struct fsi fsi_t; - typedef struct fsi_file fsi_file_t; - -+/* -+ * Optional initialization function. If invoked it loads the associated -+ * dynamic libraries for the backends ahead of time. This is required if -+ * the library is to run as part of a highly deprivileged executable, as -+ * the libraries may not be reachable after depriv. -+ */ -+int fsi_init(void); -+ - fsi_t *fsi_open_fsimage(const char *, uint64_t, const char *); - void fsi_close_fsimage(fsi_t *); - -diff --git a/tools/pygrub/src/fsimage/fsimage.c b/tools/pygrub/src/fsimage/fsimage.c -index 2ebbbe35df..92fbf2851f 100644 ---- a/tools/pygrub/src/fsimage/fsimage.c -+++ b/tools/pygrub/src/fsimage/fsimage.c -@@ -286,6 +286,15 @@ fsimage_getbootstring(PyObject *o, PyObject *args) - return Py_BuildValue("s", bootstring); - } - -+static PyObject * -+fsimage_init(PyObject *o, PyObject *args) -+{ -+ if (!PyArg_ParseTuple(args, "")) -+ return (NULL); -+ -+ return Py_BuildValue("i", fsi_init()); -+} -+ - PyDoc_STRVAR(fsimage_open__doc__, - "open(name, [offset=off]) - Open the given file as a filesystem image.\n" - "\n" -@@ -297,7 +306,13 @@ PyDoc_STRVAR(fsimage_getbootstring__doc__, - "getbootstring(fs) - Return the boot string needed for this file system " - "or NULL if none is needed.\n"); - -+PyDoc_STRVAR(fsimage_init__doc__, -+ "init() - Loads every dynamic library contained in xenfsimage " -+ "into memory so that it can be used in chrooted environments.\n"); -+ - static struct PyMethodDef fsimage_module_methods[] = { -+ { "init", (PyCFunction)fsimage_init, -+ METH_VARARGS, fsimage_init__doc__ }, - { "open", (PyCFunction)fsimage_open, - METH_VARARGS|METH_KEYWORDS, fsimage_open__doc__ }, - { "getbootstring", 
(PyCFunction)fsimage_getbootstring, --- -2.42.0 - diff --git a/0051-tools-pygrub-Deprivilege-pygrub.patch b/0051-tools-pygrub-Deprivilege-pygrub.patch deleted file mode 100644 index 1d89191..0000000 --- a/0051-tools-pygrub-Deprivilege-pygrub.patch +++ /dev/null @@ -1,307 +0,0 @@ -From f5e211654e5fbb7f1fc5cfea7f9c7ab525edb9e7 Mon Sep 17 00:00:00 2001 -From: Alejandro Vallejo <alejandro.vallejo@cloud.com> -Date: Mon, 25 Sep 2023 18:32:25 +0100 -Subject: [PATCH 51/55] tools/pygrub: Deprivilege pygrub - -Introduce a --runas=<uid> flag to deprivilege pygrub on Linux and *BSDs. It -also implicitly creates a chroot env where it drops a deprivileged forked -process. The chroot itself is cleaned up at the end. - -If the --runas arg is present, then pygrub forks, leaving the child to -deprivilege itself, and waiting for it to complete. When the child exists, -the parent performs cleanup and exits with the same error code. - -This is roughly what the child does: - 1. Initialize libfsimage (this loads every .so in memory so the chroot - can avoid bind-mounting /{,usr}/lib* - 2. Create a temporary empty chroot directory - 3. Mount tmpfs in it - 4. Bind mount the disk inside, because libfsimage expects a path, not a - file descriptor. - 5. Remount the root tmpfs to be stricter (ro,nosuid,nodev) - 6. Set RLIMIT_FSIZE to a sensibly high amount (128 MiB) - 7. Depriv gid, groups and uid - -With this scheme in place, the "output" files are writable (up to -RLIMIT_FSIZE octets) and the exposed filesystem is immutable and contains -the single only file we can't easily get rid of (the disk). - -If running on Linux, the child process also unshares mount, IPC, and -network namespaces before dropping its privileges. 
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com> -(cherry picked from commit e0342ae5556f2b6e2db50701b8a0679a45822ca6) ---- - tools/pygrub/setup.py | 2 +- - tools/pygrub/src/pygrub | 162 +++++++++++++++++++++++++++++++++++++--- - 2 files changed, 154 insertions(+), 10 deletions(-) - -diff --git a/tools/pygrub/setup.py b/tools/pygrub/setup.py -index 0e4e3d02d3..06b96733d0 100644 ---- a/tools/pygrub/setup.py -+++ b/tools/pygrub/setup.py -@@ -17,7 +17,7 @@ xenfsimage = Extension("xenfsimage", - pkgs = [ 'grub' ] - - setup(name='pygrub', -- version='0.6', -+ version='0.7', - description='Boot loader that looks a lot like grub for Xen', - author='Jeremy Katz', - author_email='katzj@redhat.com', -diff --git a/tools/pygrub/src/pygrub b/tools/pygrub/src/pygrub -index 91e2ec2ab1..7cea496ade 100755 ---- a/tools/pygrub/src/pygrub -+++ b/tools/pygrub/src/pygrub -@@ -16,8 +16,11 @@ from __future__ import print_function - - import os, sys, string, struct, tempfile, re, traceback, stat, errno - import copy -+import ctypes, ctypes.util - import logging - import platform -+import resource -+import subprocess - - import curses, _curses, curses.textpad, curses.ascii - import getopt -@@ -27,10 +30,135 @@ import grub.GrubConf - import grub.LiloConf - import grub.ExtLinuxConf - --PYGRUB_VER = 0.6 -+PYGRUB_VER = 0.7 - FS_READ_MAX = 1024 * 1024 - SECTOR_SIZE = 512 - -+# Unless provided through the env variable PYGRUB_MAX_FILE_SIZE_MB, then -+# this is the maximum filesize allowed for files written by the depriv -+# pygrub -+LIMIT_FSIZE = 128 << 20 -+ -+CLONE_NEWNS = 0x00020000 # mount namespace -+CLONE_NEWNET = 0x40000000 # network namespace -+CLONE_NEWIPC = 0x08000000 # IPC namespace -+ -+def unshare(flags): -+ if not sys.platform.startswith("linux"): -+ print("skip_unshare reason=not_linux platform=%s", sys.platform, file=sys.stderr) -+ return -+ -+ libc = ctypes.CDLL(ctypes.util.find_library('c'), use_errno=True) -+ 
unshare_prototype = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, use_errno=True) -+ unshare = unshare_prototype(('unshare', libc)) -+ -+ if unshare(flags) < 0: -+ raise OSError(ctypes.get_errno(), os.strerror(ctypes.get_errno())) -+ -+def bind_mount(src, dst, options): -+ open(dst, "a").close() # touch -+ -+ rc = subprocess.call(["mount", "--bind", "-o", options, src, dst]) -+ if rc != 0: -+ raise RuntimeError("bad_mount: src=%s dst=%s opts=%s" % -+ (src, dst, options)) -+ -+def downgrade_rlimits(): -+ # Wipe the authority to use unrequired resources -+ resource.setrlimit(resource.RLIMIT_NPROC, (0, 0)) -+ resource.setrlimit(resource.RLIMIT_CORE, (0, 0)) -+ resource.setrlimit(resource.RLIMIT_MEMLOCK, (0, 0)) -+ -+ # py2's resource module doesn't know about resource.RLIMIT_MSGQUEUE -+ # -+ # TODO: Use resource.RLIMIT_MSGQUEUE after python2 is deprecated -+ if sys.platform.startswith('linux'): -+ RLIMIT_MSGQUEUE = 12 -+ resource.setrlimit(RLIMIT_MSGQUEUE, (0, 0)) -+ -+ # The final look of the filesystem for this process is fully RO, but -+ # note we have some file descriptor already open (notably, kernel and -+ # ramdisk). In order to avoid a compromised pygrub from filling up the -+ # filesystem we set RLIMIT_FSIZE to a high bound, so that the file -+ # write permissions are bound. -+ fsize = LIMIT_FSIZE -+ if "PYGRUB_MAX_FILE_SIZE_MB" in os.environ.keys(): -+ fsize = os.environ["PYGRUB_MAX_FILE_SIZE_MB"] << 20 -+ -+ resource.setrlimit(resource.RLIMIT_FSIZE, (fsize, fsize)) -+ -+def depriv(output_directory, output, device, uid, path_kernel, path_ramdisk): -+ # The only point of this call is to force the loading of libfsimage. 
-+ # That way, we don't need to bind-mount it into the chroot -+ rc = xenfsimage.init() -+ if rc != 0: -+ os.unlink(path_ramdisk) -+ os.unlink(path_kernel) -+ raise RuntimeError("bad_xenfsimage: rc=%d" % rc) -+ -+ # Create a temporary directory for the chroot -+ chroot = tempfile.mkdtemp(prefix=str(uid)+'-', dir=output_directory) + '/' -+ device_path = '/device' -+ -+ pid = os.fork() -+ if pid: -+ # parent -+ _, rc = os.waitpid(pid, 0) -+ -+ for path in [path_kernel, path_ramdisk]: -+ # If the child didn't write anything, just get rid of it, -+ # otherwise we end up consuming a 0-size file when parsing -+ # systems without a ramdisk that the ultimate caller of pygrub -+ # may just be unaware of -+ if rc != 0 or os.path.getsize(path) == 0: -+ os.unlink(path) -+ -+ # Normally, unshare(CLONE_NEWNS) will ensure this is not required. -+ # However, this syscall doesn't exist in *BSD systems and doesn't -+ # auto-unmount everything on older Linux kernels (At least as of -+ # Linux 4.19, but it seems fixed in 5.15). Either way, -+ # recursively unmount everything if needed. Quietly. -+ with open('/dev/null', 'w') as devnull: -+ subprocess.call(["umount", "-f", chroot + device_path], -+ stdout=devnull, stderr=devnull) -+ subprocess.call(["umount", "-f", chroot], -+ stdout=devnull, stderr=devnull) -+ os.rmdir(chroot) -+ -+ sys.exit(rc) -+ -+ # By unsharing the namespace we're making sure it's all bulk-released -+ # at the end, when the namespaces disappear. This means the kernel does -+ # (almost) all the cleanup for us and the parent just has to remove the -+ # temporary directory. -+ unshare(CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWNET) -+ -+ # Set sensible limits using the setrlimit interface -+ downgrade_rlimits() -+ -+ # We'll mount tmpfs on the chroot to ensure the deprivileged child -+ # cannot affect the persistent state. It's RW now in order to -+ # bind-mount the device, but note it's remounted RO after that. 
-+ rc = subprocess.call(["mount", "-t", "tmpfs", "none", chroot]) -+ if rc != 0: -+ raise RuntimeError("mount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot)) -+ -+ # Bind the untrusted device RO -+ bind_mount(device, chroot + device_path, "ro,nosuid,noexec") -+ -+ rc = subprocess.call(["mount", "-t", "tmpfs", "-o", "remount,ro,nosuid,noexec,nodev", "none", chroot]) -+ if rc != 0: -+ raise RuntimeError("remount_tmpfs rc=%d dst=\"%s\"" % (rc, chroot)) -+ -+ # Drop superpowers! -+ os.chroot(chroot) -+ os.chdir('/') -+ os.setgid(uid) -+ os.setgroups([uid]) -+ os.setuid(uid) -+ -+ return device_path -+ - def read_size_roundup(fd, size): - if platform.system() != 'FreeBSD': - return size -@@ -736,7 +864,7 @@ if __name__ == "__main__": - sel = None - - def usage(): -- print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--offset=] <image>" %(sys.argv[0],), file=sys.stderr) -+ print("Usage: %s [-q|--quiet] [-i|--interactive] [-l|--list-entries] [-n|--not-really] [--output=] [--kernel=] [--ramdisk=] [--args=] [--entry=] [--output-directory=] [--output-format=sxp|simple|simple0] [--runas=] [--offset=] <image>" %(sys.argv[0],), file=sys.stderr) - - def copy_from_image(fs, file_to_read, file_type, fd_dst, path_dst, not_really): - if not_really: -@@ -760,7 +888,8 @@ if __name__ == "__main__": - os.write(fd_dst, data) - except Exception as e: - print(e, file=sys.stderr) -- os.unlink(path_dst) -+ if path_dst: -+ os.unlink(path_dst) - del datafile - sys.exit("Error writing temporary copy of "+file_type) - dataoff += len(data) -@@ -769,7 +898,7 @@ if __name__ == "__main__": - opts, args = getopt.gnu_getopt(sys.argv[1:], 'qilnh::', - ["quiet", "interactive", "list-entries", "not-really", "help", - "output=", "output-format=", "output-directory=", "offset=", -- "entry=", "kernel=", -+ "runas=", "entry=", "kernel=", - "ramdisk=", "args=", 
"isconfig", "debug"]) - except getopt.GetoptError: - usage() -@@ -790,6 +919,7 @@ if __name__ == "__main__": - not_really = False - output_format = "sxp" - output_directory = "/var/run/xen/pygrub/" -+ uid = None - - # what was passed in - incfg = { "kernel": None, "ramdisk": None, "args": "" } -@@ -813,6 +943,13 @@ if __name__ == "__main__": - elif o in ("--output",): - if a != "-": - output = a -+ elif o in ("--runas",): -+ try: -+ uid = int(a) -+ except ValueError: -+ print("runas value must be an integer user id") -+ usage() -+ sys.exit(1) - elif o in ("--kernel",): - incfg["kernel"] = a - elif o in ("--ramdisk",): -@@ -849,6 +986,10 @@ if __name__ == "__main__": - if debug: - logging.basicConfig(level=logging.DEBUG) - -+ if interactive and uid: -+ print("In order to use --runas, you must also set --entry or -q", file=sys.stderr) -+ sys.exit(1) -+ - try: - os.makedirs(output_directory, 0o700) - except OSError as e: -@@ -870,6 +1011,9 @@ if __name__ == "__main__": - else: - fd = os.open(output, os.O_WRONLY) - -+ if uid: -+ file = depriv(output_directory, output, file, uid, path_kernel, path_ramdisk) -+ - # debug - if isconfig: - chosencfg = run_grub(file, entry, fs, incfg["args"]) -@@ -925,21 +1069,21 @@ if __name__ == "__main__": - raise RuntimeError("Unable to find partition containing kernel") - - copy_from_image(fs, chosencfg["kernel"], "kernel", -- fd_kernel, path_kernel, not_really) -+ fd_kernel, None if uid else path_kernel, not_really) - bootcfg["kernel"] = path_kernel - - if chosencfg["ramdisk"]: - try: - copy_from_image(fs, chosencfg["ramdisk"], "ramdisk", -- fd_ramdisk, path_ramdisk, not_really) -+ fd_ramdisk, None if uid else path_ramdisk, not_really) - except: -- if not not_really: -- os.unlink(path_kernel) -+ if not uid and not not_really: -+ os.unlink(path_kernel) - raise - bootcfg["ramdisk"] = path_ramdisk - else: - initrd = None -- if not not_really: -+ if not uid and not not_really: - os.unlink(path_ramdisk) - - args = None --- -2.42.0 - diff 
--git a/0052-libxl-add-support-for-running-bootloader-in-restrict.patch b/0052-libxl-add-support-for-running-bootloader-in-restrict.patch deleted file mode 100644 index 08691b9..0000000 --- a/0052-libxl-add-support-for-running-bootloader-in-restrict.patch +++ /dev/null @@ -1,251 +0,0 @@ -From 42bf49d74b711ca7fef37bcde12928220c8e9700 Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne <roger.pau@citrix.com> -Date: Mon, 25 Sep 2023 14:30:20 +0200 -Subject: [PATCH 52/55] libxl: add support for running bootloader in restricted - mode -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Much like the device model depriv mode, add the same kind of support for the -bootloader. Such feature allows passing a UID as a parameter for the -bootloader to run as, together with the bootloader itself taking the necessary -actions to isolate. - -Note that the user to run the bootloader as must have the right permissions to -access the guest disk image (in read mode only), and that the bootloader will -be run in non-interactive mode when restricted. - -If enabled bootloader restrict mode will attempt to re-use the user(s) from the -QEMU depriv implementation if no user is provided on the configuration file or -the environment. See docs/features/qemu-deprivilege.pandoc for more -information about how to setup those users. - -Bootloader restrict mode is not enabled by default as it requires certain -setup to be done first (setup of the user(s) to use in restrict mode). 
- -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -(cherry picked from commit 1f762642d2cad1a40634e3280361928109d902f1) ---- - docs/man/xl.1.pod.in | 33 +++++++++++ - tools/libs/light/libxl_bootloader.c | 89 ++++++++++++++++++++++++++++- - tools/libs/light/libxl_dm.c | 8 +-- - tools/libs/light/libxl_internal.h | 8 +++ - 4 files changed, 131 insertions(+), 7 deletions(-) - -diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in -index 101e14241d..4831e12242 100644 ---- a/docs/man/xl.1.pod.in -+++ b/docs/man/xl.1.pod.in -@@ -1957,6 +1957,39 @@ ignored: - - =back - -+=head1 ENVIRONMENT VARIABLES -+ -+The following environment variables shall affect the execution of xl: -+ -+=over 4 -+ -+=item LIBXL_BOOTLOADER_RESTRICT -+ -+Attempt to restrict the bootloader after startup, to limit the -+consequences of security vulnerabilities due to parsing guest -+owned image files. -+ -+See docs/features/qemu-deprivilege.pandoc for more information -+on how to setup the unprivileged users. -+ -+Note that running the bootloader in restricted mode also implies using -+non-interactive mode, and the disk image must be readable by the -+restricted user. -+ -+Having this variable set is equivalent to enabling the option, even if the -+value is 0. -+ -+=item LIBXL_BOOTLOADER_USER -+ -+When using bootloader_restrict, run the bootloader as this user. If -+not set the default QEMU restrict users will be used. -+ -+NOTE: Each domain MUST have a SEPARATE username. -+ -+See docs/features/qemu-deprivilege.pandoc for more information. 
-+ -+=back -+ - =head1 SEE ALSO - - The following man pages: -diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c -index 108329b4a5..23c0ef3e89 100644 ---- a/tools/libs/light/libxl_bootloader.c -+++ b/tools/libs/light/libxl_bootloader.c -@@ -14,6 +14,7 @@ - - #include "libxl_osdeps.h" /* must come before any other headers */ - -+#include <pwd.h> - #include <termios.h> - #ifdef HAVE_UTMP_H - #include <utmp.h> -@@ -42,8 +43,71 @@ static void bootloader_arg(libxl__bootloader_state *bl, const char *arg) - bl->args[bl->nargs++] = arg; - } - --static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, -- const char *bootloader_path) -+static int bootloader_uid(libxl__gc *gc, domid_t guest_domid, -+ const char *user, uid_t *intended_uid) -+{ -+ struct passwd *user_base, user_pwbuf; -+ int rc; -+ -+ if (user) { -+ rc = userlookup_helper_getpwnam(gc, user, &user_pwbuf, &user_base); -+ if (rc) return rc; -+ -+ if (!user_base) { -+ LOGD(ERROR, guest_domid, "Couldn't find user %s", user); -+ return ERROR_INVAL; -+ } -+ -+ *intended_uid = user_base->pw_uid; -+ return 0; -+ } -+ -+ /* Re-use QEMU user range for the bootloader. 
*/ -+ rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_RANGE_BASE, -+ &user_pwbuf, &user_base); -+ if (rc) return rc; -+ -+ if (user_base) { -+ struct passwd *user_clash, user_clash_pwbuf; -+ uid_t temp_uid = user_base->pw_uid + guest_domid; -+ -+ rc = userlookup_helper_getpwuid(gc, temp_uid, &user_clash_pwbuf, -+ &user_clash); -+ if (rc) return rc; -+ -+ if (user_clash) { -+ LOGD(ERROR, guest_domid, -+ "wanted to use uid %ld (%s + %d) but that is user %s !", -+ (long)temp_uid, LIBXL_QEMU_USER_RANGE_BASE, -+ guest_domid, user_clash->pw_name); -+ return ERROR_INVAL; -+ } -+ -+ *intended_uid = temp_uid; -+ return 0; -+ } -+ -+ rc = userlookup_helper_getpwnam(gc, LIBXL_QEMU_USER_SHARED, &user_pwbuf, -+ &user_base); -+ if (rc) return rc; -+ -+ if (user_base) { -+ LOGD(WARN, guest_domid, "Could not find user %s, falling back to %s", -+ LIBXL_QEMU_USER_RANGE_BASE, LIBXL_QEMU_USER_SHARED); -+ *intended_uid = user_base->pw_uid; -+ -+ return 0; -+ } -+ -+ LOGD(ERROR, guest_domid, -+ "Could not find user %s or range base pseudo-user %s, cannot restrict", -+ LIBXL_QEMU_USER_SHARED, LIBXL_QEMU_USER_RANGE_BASE); -+ -+ return ERROR_INVAL; -+} -+ -+static int make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, -+ const char *bootloader_path) - { - const libxl_domain_build_info *info = bl->info; - -@@ -61,6 +125,23 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, - ARG(GCSPRINTF("--ramdisk=%s", info->ramdisk)); - if (info->cmdline && *info->cmdline != '\0') - ARG(GCSPRINTF("--args=%s", info->cmdline)); -+ if (getenv("LIBXL_BOOTLOADER_RESTRICT") || -+ getenv("LIBXL_BOOTLOADER_USER")) { -+ uid_t uid = -1; -+ int rc = bootloader_uid(gc, bl->domid, getenv("LIBXL_BOOTLOADER_USER"), -+ &uid); -+ -+ if (rc) return rc; -+ -+ assert(uid != -1); -+ if (!uid) { -+ LOGD(ERROR, bl->domid, "bootloader restrict UID is 0 (root)!"); -+ return ERROR_INVAL; -+ } -+ LOGD(DEBUG, bl->domid, "using uid %ld", (long)uid); -+ ARG(GCSPRINTF("--runas=%ld", 
(long)uid)); -+ ARG("--quiet"); -+ } - - ARG(GCSPRINTF("--output=%s", bl->outputpath)); - ARG("--output-format=simple0"); -@@ -79,6 +160,7 @@ static void make_bootloader_args(libxl__gc *gc, libxl__bootloader_state *bl, - /* Sentinel for execv */ - ARG(NULL); - -+ return 0; - #undef ARG - } - -@@ -443,7 +525,8 @@ static void bootloader_disk_attached_cb(libxl__egc *egc, - bootloader = bltmp; - } - -- make_bootloader_args(gc, bl, bootloader); -+ rc = make_bootloader_args(gc, bl, bootloader); -+ if (rc) goto out; - - bl->openpty.ao = ao; - bl->openpty.callback = bootloader_gotptys; -diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c -index fc264a3a13..14b593110f 100644 ---- a/tools/libs/light/libxl_dm.c -+++ b/tools/libs/light/libxl_dm.c -@@ -80,10 +80,10 @@ static int libxl__create_qemu_logfile(libxl__gc *gc, char *name) - * On error, return a libxl-style error code. - */ - #define DEFINE_USERLOOKUP_HELPER(NAME,SPEC_TYPE,STRUCTNAME,SYSCONF) \ -- static int userlookup_helper_##NAME(libxl__gc *gc, \ -- SPEC_TYPE spec, \ -- struct STRUCTNAME *resultbuf, \ -- struct STRUCTNAME **out) \ -+ int userlookup_helper_##NAME(libxl__gc *gc, \ -+ SPEC_TYPE spec, \ -+ struct STRUCTNAME *resultbuf, \ -+ struct STRUCTNAME **out) \ - { \ - struct STRUCTNAME *resultp = NULL; \ - char *buf = NULL; \ -diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h -index 7ad38de30e..f1e3a9a15b 100644 ---- a/tools/libs/light/libxl_internal.h -+++ b/tools/libs/light/libxl_internal.h -@@ -4873,6 +4873,14 @@ struct libxl__cpu_policy { - struct xc_msr *msr; - }; - -+struct passwd; -+_hidden int userlookup_helper_getpwnam(libxl__gc*, const char *user, -+ struct passwd *res, -+ struct passwd **out); -+_hidden int userlookup_helper_getpwuid(libxl__gc*, uid_t uid, -+ struct passwd *res, -+ struct passwd **out); -+ - #endif - - /* --- -2.42.0 - diff --git a/0053-libxl-limit-bootloader-execution-in-restricted-mode.patch 
b/0053-libxl-limit-bootloader-execution-in-restricted-mode.patch deleted file mode 100644 index 8c790d3..0000000 --- a/0053-libxl-limit-bootloader-execution-in-restricted-mode.patch +++ /dev/null @@ -1,158 +0,0 @@ -From 46d00dbf4c22b28910f73f66a03e5cabe50b5395 Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne <roger.pau@citrix.com> -Date: Thu, 28 Sep 2023 12:22:35 +0200 -Subject: [PATCH 53/55] libxl: limit bootloader execution in restricted mode -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Introduce a timeout for bootloader execution when running in restricted mode. - -Allow overwriting the default time out with an environment provided value. - -This is part of XSA-443 / CVE-2023-34325 - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -(cherry picked from commit 9c114178ffd700112e91f5ec66cf5151b9c9a8cc) ---- - docs/man/xl.1.pod.in | 8 ++++++ - tools/libs/light/libxl_bootloader.c | 40 +++++++++++++++++++++++++++++ - tools/libs/light/libxl_internal.h | 2 ++ - 3 files changed, 50 insertions(+) - -diff --git a/docs/man/xl.1.pod.in b/docs/man/xl.1.pod.in -index 4831e12242..c3eb6570ab 100644 ---- a/docs/man/xl.1.pod.in -+++ b/docs/man/xl.1.pod.in -@@ -1988,6 +1988,14 @@ NOTE: Each domain MUST have a SEPARATE username. - - See docs/features/qemu-deprivilege.pandoc for more information. - -+=item LIBXL_BOOTLOADER_TIMEOUT -+ -+Timeout in seconds for bootloader execution when running in restricted mode. -+Otherwise the build time default in LIBXL_BOOTLOADER_TIMEOUT will be used. -+ -+If defined the value must be an unsigned integer between 0 and INT_MAX, -+otherwise behavior is undefined. Setting to 0 disables the timeout. 
-+ - =back - - =head1 SEE ALSO -diff --git a/tools/libs/light/libxl_bootloader.c b/tools/libs/light/libxl_bootloader.c -index 23c0ef3e89..ee26d08f37 100644 ---- a/tools/libs/light/libxl_bootloader.c -+++ b/tools/libs/light/libxl_bootloader.c -@@ -30,6 +30,8 @@ static void bootloader_keystrokes_copyfail(libxl__egc *egc, - libxl__datacopier_state *dc, int rc, int onwrite, int errnoval); - static void bootloader_display_copyfail(libxl__egc *egc, - libxl__datacopier_state *dc, int rc, int onwrite, int errnoval); -+static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev, -+ const struct timeval *requested_abs, int rc); - static void bootloader_domaindeath(libxl__egc*, libxl__domaindeathcheck *dc, - int rc); - static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child, -@@ -297,6 +299,7 @@ void libxl__bootloader_init(libxl__bootloader_state *bl) - bl->ptys[0].master = bl->ptys[0].slave = 0; - bl->ptys[1].master = bl->ptys[1].slave = 0; - libxl__ev_child_init(&bl->child); -+ libxl__ev_time_init(&bl->time); - libxl__domaindeathcheck_init(&bl->deathcheck); - bl->keystrokes.ao = bl->ao; libxl__datacopier_init(&bl->keystrokes); - bl->display.ao = bl->ao; libxl__datacopier_init(&bl->display); -@@ -314,6 +317,7 @@ static void bootloader_cleanup(libxl__egc *egc, libxl__bootloader_state *bl) - libxl__domaindeathcheck_stop(gc,&bl->deathcheck); - libxl__datacopier_kill(&bl->keystrokes); - libxl__datacopier_kill(&bl->display); -+ libxl__ev_time_deregister(gc, &bl->time); - for (i=0; i<2; i++) { - libxl__carefd_close(bl->ptys[i].master); - libxl__carefd_close(bl->ptys[i].slave); -@@ -375,6 +379,7 @@ static void bootloader_stop(libxl__egc *egc, - - libxl__datacopier_kill(&bl->keystrokes); - libxl__datacopier_kill(&bl->display); -+ libxl__ev_time_deregister(gc, &bl->time); - if (libxl__ev_child_inuse(&bl->child)) { - r = kill(bl->child.pid, SIGTERM); - if (r) LOGED(WARN, bl->domid, "%sfailed to kill bootloader [%lu]", -@@ -637,6 +642,25 @@ static void 
bootloader_gotptys(libxl__egc *egc, libxl__openpty_state *op) - - struct termios termattr; - -+ if (getenv("LIBXL_BOOTLOADER_RESTRICT") || -+ getenv("LIBXL_BOOTLOADER_USER")) { -+ const char *timeout_env = getenv("LIBXL_BOOTLOADER_TIMEOUT"); -+ int timeout = timeout_env ? atoi(timeout_env) -+ : LIBXL_BOOTLOADER_TIMEOUT; -+ -+ if (timeout) { -+ /* Set execution timeout */ -+ rc = libxl__ev_time_register_rel(ao, &bl->time, -+ bootloader_timeout, -+ timeout * 1000); -+ if (rc) { -+ LOGED(ERROR, bl->domid, -+ "unable to register timeout for bootloader execution"); -+ goto out; -+ } -+ } -+ } -+ - pid_t pid = libxl__ev_child_fork(gc, &bl->child, bootloader_finished); - if (pid == -1) { - rc = ERROR_FAIL; -@@ -702,6 +726,21 @@ static void bootloader_display_copyfail(libxl__egc *egc, - libxl__bootloader_state *bl = CONTAINER_OF(dc, *bl, display); - bootloader_copyfail(egc, "bootloader output", bl, 1, rc,onwrite,errnoval); - } -+static void bootloader_timeout(libxl__egc *egc, libxl__ev_time *ev, -+ const struct timeval *requested_abs, int rc) -+{ -+ libxl__bootloader_state *bl = CONTAINER_OF(ev, *bl, time); -+ STATE_AO_GC(bl->ao); -+ -+ libxl__ev_time_deregister(gc, &bl->time); -+ -+ assert(libxl__ev_child_inuse(&bl->child)); -+ LOGD(ERROR, bl->domid, "killing bootloader because of timeout"); -+ -+ libxl__ev_child_kill_deregister(ao, &bl->child, SIGKILL); -+ -+ bootloader_callback(egc, bl, rc); -+} - - static void bootloader_domaindeath(libxl__egc *egc, - libxl__domaindeathcheck *dc, -@@ -718,6 +757,7 @@ static void bootloader_finished(libxl__egc *egc, libxl__ev_child *child, - STATE_AO_GC(bl->ao); - int rc; - -+ libxl__ev_time_deregister(gc, &bl->time); - libxl__datacopier_kill(&bl->keystrokes); - libxl__datacopier_kill(&bl->display); - -diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h -index f1e3a9a15b..d05783617f 100644 ---- a/tools/libs/light/libxl_internal.h -+++ b/tools/libs/light/libxl_internal.h -@@ -102,6 +102,7 @@ - #define 
LIBXL_QMP_CMD_TIMEOUT 10 - #define LIBXL_STUBDOM_START_TIMEOUT 30 - #define LIBXL_QEMU_BODGE_TIMEOUT 2 -+#define LIBXL_BOOTLOADER_TIMEOUT 120 - #define LIBXL_XENCONSOLE_LIMIT 1048576 - #define LIBXL_XENCONSOLE_PROTOCOL "vt100" - #define LIBXL_MAXMEM_CONSTANT 1024 -@@ -3744,6 +3745,7 @@ struct libxl__bootloader_state { - libxl__openpty_state openpty; - libxl__openpty_result ptys[2]; /* [0] is for bootloader */ - libxl__ev_child child; -+ libxl__ev_time time; - libxl__domaindeathcheck deathcheck; - int nargs, argsspace; - const char **args; --- -2.42.0 - diff --git a/0054-x86-svm-Fix-asymmetry-with-AMD-DR-MASK-context-switc.patch b/0054-x86-svm-Fix-asymmetry-with-AMD-DR-MASK-context-switc.patch deleted file mode 100644 index af72c9a..0000000 --- a/0054-x86-svm-Fix-asymmetry-with-AMD-DR-MASK-context-switc.patch +++ /dev/null @@ -1,104 +0,0 @@ -From 3f8b444072fd8615288d9d11e53fbf0b6a8a7750 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 26 Sep 2023 20:03:36 +0100 -Subject: [PATCH 54/55] x86/svm: Fix asymmetry with AMD DR MASK context - switching - -The handling of MSR_DR{0..3}_MASK is asymmetric between PV and HVM guests. - -HVM guests context switch in based on the guest view of DBEXT, whereas PV -guest switch in base on the host capability. Both guest types leave the -context dirty for the next vCPU. - -This leads to the following issue: - - * PV or HVM vCPU has debugging active (%dr7 + mask) - * Switch out deactivates %dr7 but leaves other state stale in hardware - * HVM vCPU with debugging activate but can't see DBEXT is switched in - * Switch in loads %dr7 but leaves the mask MSRs alone - -Now, the HVM vCPU is operating in the context of the prior vCPU's mask MSR, -and furthermore in a case where it genuinely expects there to be no masking -MSRs. - -As a stopgap, adjust the HVM path to switch in/out the masks based on host -capabilities rather than guest visibility (i.e. like the PV path). 
Adjustment -of the of the intercepts still needs to be dependent on the guest visibility -of DBEXT. - -This is part of XSA-444 / CVE-2023-34327 - -Fixes: c097f54912d3 ("x86/SVM: support data breakpoint extension registers") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -(cherry picked from commit 5d54282f984bb9a7a65b3d12208584f9fdf1c8e1) ---- - xen/arch/x86/hvm/svm/svm.c | 24 ++++++++++++++++++------ - xen/arch/x86/traps.c | 5 +++++ - 2 files changed, 23 insertions(+), 6 deletions(-) - -diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c -index e8f50e7c5e..fd32600ae3 100644 ---- a/xen/arch/x86/hvm/svm/svm.c -+++ b/xen/arch/x86/hvm/svm/svm.c -@@ -339,6 +339,10 @@ static void svm_save_dr(struct vcpu *v) - v->arch.hvm.flag_dr_dirty = 0; - vmcb_set_dr_intercepts(vmcb, ~0u); - -+ /* -+ * The guest can only have changed the mask MSRs if we previous dropped -+ * intercepts. Re-read them from hardware. -+ */ - if ( v->domain->arch.cpuid->extd.dbext ) - { - svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_RW); -@@ -370,17 +374,25 @@ static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v) - - ASSERT(v == current); - -- if ( v->domain->arch.cpuid->extd.dbext ) -+ /* -+ * Both the PV and HVM paths leave stale DR_MASK values in hardware on -+ * context-switch-out. If we're activating %dr7 for the guest, we must -+ * sync the DR_MASKs too, whether or not the guest can see them. 
-+ */ -+ if ( boot_cpu_has(X86_FEATURE_DBEXT) ) - { -- svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE); -- svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE); -- svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE); -- svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE); -- - wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.msrs->dr_mask[0]); - wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.msrs->dr_mask[1]); - wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.msrs->dr_mask[2]); - wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.msrs->dr_mask[3]); -+ -+ if ( v->domain->arch.cpuid->extd.dbext ) -+ { -+ svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE); -+ svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE); -+ svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE); -+ svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE); -+ } - } - - write_debugreg(0, v->arch.dr[0]); -diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c -index e65cc60041..06c4f3868b 100644 ---- a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -2281,6 +2281,11 @@ void activate_debugregs(const struct vcpu *curr) - if ( curr->arch.dr7 & DR7_ACTIVE_MASK ) - write_debugreg(7, curr->arch.dr7); - -+ /* -+ * Both the PV and HVM paths leave stale DR_MASK values in hardware on -+ * context-switch-out. If we're activating %dr7 for the guest, we must -+ * sync the DR_MASKs too, whether or not the guest can see them. 
-+ */ - if ( boot_cpu_has(X86_FEATURE_DBEXT) ) - { - wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.msrs->dr_mask[0]); --- -2.42.0 - diff --git a/0055-x86-pv-Correct-the-auditing-of-guest-breakpoint-addr.patch b/0055-x86-pv-Correct-the-auditing-of-guest-breakpoint-addr.patch deleted file mode 100644 index 5838e7f..0000000 --- a/0055-x86-pv-Correct-the-auditing-of-guest-breakpoint-addr.patch +++ /dev/null @@ -1,86 +0,0 @@ -From 0b56bed864ca9b572473957f0254aefa797216f2 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 26 Sep 2023 20:03:36 +0100 -Subject: [PATCH 55/55] x86/pv: Correct the auditing of guest breakpoint - addresses -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The use of access_ok() is buggy, because it permits access to the compat -translation area. 64bit PV guests don't use the XLAT area, but on AMD -hardware, the DBEXT feature allows a breakpoint to match up to a 4G aligned -region, allowing the breakpoint to reach outside of the XLAT area. - -Prior to c/s cda16c1bb223 ("x86: mirror compat argument translation area for -32-bit PV"), the live GDT was within 4G of the XLAT area. - -All together, this allowed a malicious 64bit PV guest on AMD hardware to place -a breakpoint over the live GDT, and trigger a #DB livelock (CVE-2015-8104). - -Introduce breakpoint_addr_ok() and explain why __addr_ok() happens to be an -appropriate check in this case. - -For Xen 4.14 and later, this is a latent bug because the XLAT area has moved -to be on its own with nothing interesting adjacent. For Xen 4.13 and older on -AMD hardware, this fixes a PV-trigger-able DoS. - -This is part of XSA-444 / CVE-2023-34328. 
- -Fixes: 65e355490817 ("x86/PV: support data breakpoint extension registers") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit dc9d9aa62ddeb14abd5672690d30789829f58f7e) ---- - xen/arch/x86/include/asm/debugreg.h | 20 ++++++++++++++++++++ - xen/arch/x86/pv/misc-hypercalls.c | 2 +- - 2 files changed, 21 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/include/asm/debugreg.h b/xen/arch/x86/include/asm/debugreg.h -index c57914efc6..cc29826524 100644 ---- a/xen/arch/x86/include/asm/debugreg.h -+++ b/xen/arch/x86/include/asm/debugreg.h -@@ -77,6 +77,26 @@ - asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) ); \ - __val; \ - }) -+ -+/* -+ * Architecturally, %dr{0..3} can have any arbitrary value. However, Xen -+ * can't allow the guest to breakpoint the Xen address range, so we limit the -+ * guest to the lower canonical half, or above the Xen range in the higher -+ * canonical half. -+ * -+ * Breakpoint lengths are specified to mask the low order address bits, -+ * meaning all breakpoints are naturally aligned. With %dr7, the widest -+ * breakpoint is 8 bytes. With DBEXT, the widest breakpoint is 4G. Both of -+ * the Xen boundaries have >4G alignment. -+ * -+ * In principle we should account for HYPERVISOR_COMPAT_VIRT_START(d), but -+ * 64bit Xen has never enforced this for compat guests, and there's no problem -+ * (to Xen) if the guest breakpoints it's alias of the M2P. Skipping this -+ * aspect simplifies the logic, and causes us not to reject a migrating guest -+ * which operated fine on prior versions of Xen. 
-+ */ -+#define breakpoint_addr_ok(a) __addr_ok(a) -+ - long set_debugreg(struct vcpu *, unsigned int reg, unsigned long value); - void activate_debugregs(const struct vcpu *); - -diff --git a/xen/arch/x86/pv/misc-hypercalls.c b/xen/arch/x86/pv/misc-hypercalls.c -index aaaf70eb63..f8636de907 100644 ---- a/xen/arch/x86/pv/misc-hypercalls.c -+++ b/xen/arch/x86/pv/misc-hypercalls.c -@@ -72,7 +72,7 @@ long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value) - switch ( reg ) - { - case 0 ... 3: -- if ( !access_ok(value, sizeof(long)) ) -+ if ( !breakpoint_addr_ok(value) ) - return -EPERM; - - v->arch.dr[reg] = value; --- -2.42.0 - @@ -1,6 +1,6 @@ -Xen upstream patchset #0 for 4.17.3-pre +Xen upstream patchset #0 for 4.17.4-pre Containing patches from -RELEASE-4.17.2 (b86c313a4a9c3ec4c9f825d9b99131753296485f) +RELEASE-4.17.3 (07f413d7ffb06eab36045bd19f53555de1cacf62) to -staging-4.17 (0b56bed864ca9b572473957f0254aefa797216f2) +staging-4.17 (091466ba55d1e2e75738f751818ace2e3ed08ccf) |