diff options
148 files changed, 5420 insertions, 13296 deletions
diff --git a/0001-update-Xen-version-to-4.16.3-pre.patch b/0001-update-Xen-version-to-4.16.4-pre.patch index d04dd34..961358a 100644 --- a/0001-update-Xen-version-to-4.16.3-pre.patch +++ b/0001-update-Xen-version-to-4.16.4-pre.patch @@ -1,25 +1,25 @@ -From 4aa32912ebeda8cb94d1c3941e7f1f0a2d4f921b Mon Sep 17 00:00:00 2001 +From e3396cd8be5ee99d363a23f30c680e42fb2757bd Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 14:49:41 +0200 -Subject: [PATCH 01/87] update Xen version to 4.16.3-pre +Date: Tue, 20 Dec 2022 13:50:16 +0100 +Subject: [PATCH 01/61] update Xen version to 4.16.4-pre --- xen/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xen/Makefile b/xen/Makefile -index 76d0a3ff253f..8a403ee896cd 100644 +index 06dde1e03c..67c5551ffd 100644 --- a/xen/Makefile +++ b/xen/Makefile @@ -2,7 +2,7 @@ # All other places this is stored (eg. compile.h) should be autogenerated. export XEN_VERSION = 4 export XEN_SUBVERSION = 16 --export XEN_EXTRAVERSION ?= .2$(XEN_VENDORVERSION) -+export XEN_EXTRAVERSION ?= .3-pre$(XEN_VENDORVERSION) +-export XEN_EXTRAVERSION ?= .3$(XEN_VENDORVERSION) ++export XEN_EXTRAVERSION ?= .4-pre$(XEN_VENDORVERSION) export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) -include xen-version -- -2.37.4 +2.40.0 diff --git a/0002-ioreq_broadcast-accept-partial-broadcast-success.patch b/0002-ioreq_broadcast-accept-partial-broadcast-success.patch new file mode 100644 index 0000000..1b0ae9c --- /dev/null +++ b/0002-ioreq_broadcast-accept-partial-broadcast-success.patch @@ -0,0 +1,34 @@ +From f2edbd79f5d5ce3b633885469852e1215dc0d4b5 Mon Sep 17 00:00:00 2001 +From: Per Bilse <per.bilse@citrix.com> +Date: Tue, 20 Dec 2022 13:50:47 +0100 +Subject: [PATCH 02/61] ioreq_broadcast(): accept partial broadcast success + +Avoid incorrectly triggering an error when a broadcast buffered ioreq +is not handled by all registered clients, as long as the failure is +strictly because the client 
doesn't handle buffered ioreqs. + +Signed-off-by: Per Bilse <per.bilse@citrix.com> +Reviewed-by: Paul Durrant <paul@xen.org> +master commit: a44734df6c24fadbdb001f051cc5580c467caf7d +master date: 2022-12-07 12:17:30 +0100 +--- + xen/common/ioreq.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c +index 42414b750b..2a8d8de2d5 100644 +--- a/xen/common/ioreq.c ++++ b/xen/common/ioreq.c +@@ -1322,7 +1322,8 @@ unsigned int ioreq_broadcast(ioreq_t *p, bool buffered) + + FOR_EACH_IOREQ_SERVER(d, id, s) + { +- if ( !s->enabled ) ++ if ( !s->enabled || ++ (buffered && s->bufioreq_handling == HVM_IOREQSRV_BUFIOREQ_OFF) ) + continue; + + if ( ioreq_send(s, p, buffered) == IOREQ_STATUS_UNHANDLED ) +-- +2.40.0 + diff --git a/0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch b/0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch deleted file mode 100644 index 63aa293..0000000 --- a/0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch +++ /dev/null @@ -1,62 +0,0 @@ -From 8d9531a3421dad2b0012e09e6f41d5274e162064 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 11 Oct 2022 14:52:13 +0200 -Subject: [PATCH 02/87] xen/arm: p2m: Prevent adding mapping when domain is - dying - -During the domain destroy process, the domain will still be accessible -until it is fully destroyed. So does the P2M because we don't bail -out early if is_dying is non-zero. If a domain has permission to -modify the other domain's P2M (i.e. dom0, or a stubdomain), then -foreign mapping can be added past relinquish_p2m_mapping(). - -Therefore, we need to prevent mapping to be added when the domain -is dying. This commit prevents such adding of mapping by adding the -d->is_dying check to p2m_set_entry(). Also this commit enhances the -check in relinquish_p2m_mapping() to make sure that no mappings can -be added in the P2M after the P2M lock is released. 
- -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Tested-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: 3ebe773293e3b945460a3d6f54f3b91915397bab -master date: 2022-10-11 14:20:18 +0200 ---- - xen/arch/arm/p2m.c | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 3349b464a39e..1affdafadbeb 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m, - { - int rc = 0; - -+ /* -+ * Any reference taken by the P2M mappings (e.g. foreign mapping) will -+ * be dropped in relinquish_p2m_mapping(). As the P2M will still -+ * be accessible after, we need to prevent mapping to be added when the -+ * domain is dying. -+ */ -+ if ( unlikely(p2m->domain->is_dying) ) -+ return -ENOMEM; -+ - while ( nr ) - { - unsigned long mask; -@@ -1610,6 +1619,8 @@ int relinquish_p2m_mapping(struct domain *d) - unsigned int order; - gfn_t start, end; - -+ BUG_ON(!d->is_dying); -+ /* No mappings can be added in the P2M after the P2M lock is released. */ - p2m_write_lock(p2m); - - start = p2m->lowest_mapped_gfn; --- -2.37.4 - diff --git a/0003-x86-time-prevent-overflow-with-high-frequency-TSCs.patch b/0003-x86-time-prevent-overflow-with-high-frequency-TSCs.patch new file mode 100644 index 0000000..a031317 --- /dev/null +++ b/0003-x86-time-prevent-overflow-with-high-frequency-TSCs.patch @@ -0,0 +1,34 @@ +From 65bf12135f618614bbf44626fba1c20ca8d1a127 Mon Sep 17 00:00:00 2001 +From: Neowutran <xen@neowutran.ovh> +Date: Tue, 20 Dec 2022 13:51:42 +0100 +Subject: [PATCH 03/61] x86/time: prevent overflow with high frequency TSCs + +Make sure tsc_khz is promoted to a 64-bit type before multiplying by +1000 to avoid an 'overflow before widen' bug. Otherwise just above +4.294GHz the value will overflow. 
Processors with clocks this high are +now in production and require this to work correctly. + +Signed-off-by: Neowutran <xen@neowutran.ovh> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: ad15a0a8ca2515d8ac58edfc0bc1d3719219cb77 +master date: 2022-12-19 11:34:16 +0100 +--- + xen/arch/x86/time.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c +index 1daff92dca..db0b149ec6 100644 +--- a/xen/arch/x86/time.c ++++ b/xen/arch/x86/time.c +@@ -2490,7 +2490,7 @@ int tsc_set_info(struct domain *d, + case TSC_MODE_ALWAYS_EMULATE: + d->arch.vtsc_offset = get_s_time() - elapsed_nsec; + d->arch.tsc_khz = gtsc_khz ?: cpu_khz; +- set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000); ++ set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000UL); + + /* + * In default mode use native TSC if the host has safe TSC and +-- +2.40.0 + diff --git a/0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch b/0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch deleted file mode 100644 index 0b33b0a..0000000 --- a/0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch +++ /dev/null @@ -1,167 +0,0 @@ -From 937fdbad5180440888f1fcee46299103327efa90 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 11 Oct 2022 14:52:27 +0200 -Subject: [PATCH 03/87] xen/arm: p2m: Handle preemption when freeing - intermediate page tables - -At the moment the P2M page tables will be freed when the domain structure -is freed without any preemption. As the P2M is quite large, iterating -through this may take more time than it is reasonable without intermediate -preemption (to run softirqs and perhaps scheduler). - -Split p2m_teardown() in two parts: one preemptible and called when -relinquishing the resources, the other one non-preemptible and called -when freeing the domain structure. 
- -As we are now freeing the P2M pages early, we also need to prevent -further allocation if someone call p2m_set_entry() past p2m_teardown() -(I wasn't able to prove this will never happen). This is done by -the checking domain->is_dying from previous patch in p2m_set_entry(). - -Similarly, we want to make sure that no-one can accessed the free -pages. Therefore the root is cleared before freeing pages. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Tested-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: 3202084566bba0ef0c45caf8c24302f83d92f9c8 -master date: 2022-10-11 14:20:56 +0200 ---- - xen/arch/arm/domain.c | 10 +++++++-- - xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++--- - xen/include/asm-arm/p2m.h | 13 +++++++++-- - 3 files changed, 63 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c -index 96e1b235501d..2694c39127c5 100644 ---- a/xen/arch/arm/domain.c -+++ b/xen/arch/arm/domain.c -@@ -789,10 +789,10 @@ fail: - void arch_domain_destroy(struct domain *d) - { - /* IOMMU page table is shared with P2M, always call -- * iommu_domain_destroy() before p2m_teardown(). -+ * iommu_domain_destroy() before p2m_final_teardown(). 
- */ - iommu_domain_destroy(d); -- p2m_teardown(d); -+ p2m_final_teardown(d); - domain_vgic_free(d); - domain_vuart_free(d); - free_xenheap_page(d->shared_info); -@@ -996,6 +996,7 @@ enum { - PROG_xen, - PROG_page, - PROG_mapping, -+ PROG_p2m, - PROG_done, - }; - -@@ -1056,6 +1057,11 @@ int domain_relinquish_resources(struct domain *d) - if ( ret ) - return ret; - -+ PROGRESS(p2m): -+ ret = p2m_teardown(d); -+ if ( ret ) -+ return ret; -+ - PROGRESS(done): - break; - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 1affdafadbeb..27418ee5ee98 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1527,17 +1527,58 @@ static void p2m_free_vmid(struct domain *d) - spin_unlock(&vmid_alloc_lock); - } - --void p2m_teardown(struct domain *d) -+int p2m_teardown(struct domain *d) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); -+ unsigned long count = 0; - struct page_info *pg; -+ unsigned int i; -+ int rc = 0; -+ -+ p2m_write_lock(p2m); -+ -+ /* -+ * We are about to free the intermediate page-tables, so clear the -+ * root to prevent any walk to use them. -+ */ -+ for ( i = 0; i < P2M_ROOT_PAGES; i++ ) -+ clear_and_clean_page(p2m->root + i); -+ -+ /* -+ * The domain will not be scheduled anymore, so in theory we should -+ * not need to flush the TLBs. Do it for safety purpose. -+ * -+ * Note that all the devices have already been de-assigned. So we don't -+ * need to flush the IOMMU TLB here. 
-+ */ -+ p2m_force_tlb_flush_sync(p2m); -+ -+ while ( (pg = page_list_remove_head(&p2m->pages)) ) -+ { -+ free_domheap_page(pg); -+ count++; -+ /* Arbitrarily preempt every 512 iterations */ -+ if ( !(count % 512) && hypercall_preempt_check() ) -+ { -+ rc = -ERESTART; -+ break; -+ } -+ } -+ -+ p2m_write_unlock(p2m); -+ -+ return rc; -+} -+ -+void p2m_final_teardown(struct domain *d) -+{ -+ struct p2m_domain *p2m = p2m_get_hostp2m(d); - - /* p2m not actually initialized */ - if ( !p2m->domain ) - return; - -- while ( (pg = page_list_remove_head(&p2m->pages)) ) -- free_domheap_page(pg); -+ ASSERT(page_list_empty(&p2m->pages)); - - if ( p2m->root ) - free_domheap_pages(p2m->root, P2M_ROOT_ORDER); -diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h -index 8f11d9c97b5d..b3ba83283e11 100644 ---- a/xen/include/asm-arm/p2m.h -+++ b/xen/include/asm-arm/p2m.h -@@ -192,8 +192,17 @@ void setup_virt_paging(void); - /* Init the datastructures for later use by the p2m code */ - int p2m_init(struct domain *d); - --/* Return all the p2m resources to Xen. */ --void p2m_teardown(struct domain *d); -+/* -+ * The P2M resources are freed in two parts: -+ * - p2m_teardown() will be called when relinquish the resources. It -+ * will free large resources (e.g. intermediate page-tables) that -+ * requires preemption. -+ * - p2m_final_teardown() will be called when domain struct is been -+ * freed. This *cannot* be preempted and therefore one small -+ * resources should be freed here. 
-+ */ -+int p2m_teardown(struct domain *d); -+void p2m_final_teardown(struct domain *d); - - /* - * Remove mapping refcount on each mapping page in the p2m --- -2.37.4 - diff --git a/0004-x86-S3-Restore-Xen-s-MSR_PAT-value-on-S3-resume.patch b/0004-x86-S3-Restore-Xen-s-MSR_PAT-value-on-S3-resume.patch new file mode 100644 index 0000000..3d1c089 --- /dev/null +++ b/0004-x86-S3-Restore-Xen-s-MSR_PAT-value-on-S3-resume.patch @@ -0,0 +1,36 @@ +From 7b1b9849e8a0d7791866d6d21c45993dfe27836c Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 7 Feb 2023 17:03:09 +0100 +Subject: [PATCH 04/61] x86/S3: Restore Xen's MSR_PAT value on S3 resume + +There are two paths in the trampoline, and Xen's PAT needs setting up in both, +not just the boot path. + +Fixes: 4304ff420e51 ("x86/S3: Drop {save,restore}_rest_processor_state() completely") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 4d975798e11579fdf405b348543061129e01b0fb +master date: 2023-01-10 21:21:30 +0000 +--- + xen/arch/x86/boot/wakeup.S | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/xen/arch/x86/boot/wakeup.S b/xen/arch/x86/boot/wakeup.S +index c17d613b61..08447e1934 100644 +--- a/xen/arch/x86/boot/wakeup.S ++++ b/xen/arch/x86/boot/wakeup.S +@@ -130,6 +130,11 @@ wakeup_32: + and %edi, %edx + wrmsr + 1: ++ /* Set up PAT before enabling paging. */ ++ mov $XEN_MSR_PAT & 0xffffffff, %eax ++ mov $XEN_MSR_PAT >> 32, %edx ++ mov $MSR_IA32_CR_PAT, %ecx ++ wrmsr + + /* Set up EFER (Extended Feature Enable Register). 
*/ + movl $MSR_EFER,%ecx +-- +2.40.0 + diff --git a/0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch b/0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch deleted file mode 100644 index 04c002b..0000000 --- a/0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch +++ /dev/null @@ -1,138 +0,0 @@ -From 8fc19c143b8aa563077f3d5c46fcc0a54dc04f35 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 14:52:39 +0200 -Subject: [PATCH 04/87] x86/p2m: add option to skip root pagetable removal in - p2m_teardown() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Add a new parameter to p2m_teardown() in order to select whether the -root page table should also be freed. Note that all users are -adjusted to pass the parameter to remove the root page tables, so -behavior is not modified. - -No functional change intended. - -This is part of CVE-2022-33746 / XSA-410. 
- -Suggested-by: Julien Grall <julien@xen.org> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: 1df52a270225527ae27bfa2fc40347bf93b78357 -master date: 2022-10-11 14:21:23 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 6 +++--- - xen/arch/x86/mm/p2m.c | 20 ++++++++++++++++---- - xen/arch/x86/mm/shadow/common.c | 4 ++-- - xen/include/asm-x86/p2m.h | 2 +- - 4 files changed, 22 insertions(+), 10 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index 47a7487fa7a3..a8f5a19da917 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d) - } - - for ( i = 0; i < MAX_ALTP2M; i++ ) -- p2m_teardown(d->arch.altp2m_p2m[i]); -+ p2m_teardown(d->arch.altp2m_p2m[i], true); - } - - /* Destroy nestedp2m's first */ - for (i = 0; i < MAX_NESTEDP2M; i++) { -- p2m_teardown(d->arch.nested_p2m[i]); -+ p2m_teardown(d->arch.nested_p2m[i], true); - } - - if ( d->arch.paging.hap.total_pages != 0 ) - hap_teardown(d, NULL); - -- p2m_teardown(p2m_get_hostp2m(d)); -+ p2m_teardown(p2m_get_hostp2m(d), true); - /* Free any memory that the p2m teardown released */ - paging_lock(d); - hap_set_allocation(d, 0, NULL); -diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c -index def1695cf00b..aba4f17cbe12 100644 ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -749,11 +749,11 @@ int p2m_alloc_table(struct p2m_domain *p2m) - * hvm fixme: when adding support for pvh non-hardware domains, this path must - * cleanup any foreign p2m types (release refcnts on them). - */ --void p2m_teardown(struct p2m_domain *p2m) -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root) - /* Return all the p2m pages to Xen. 
- * We know we don't have any extra mappings to these pages */ - { -- struct page_info *pg; -+ struct page_info *pg, *root_pg = NULL; - struct domain *d; - - if (p2m == NULL) -@@ -763,10 +763,22 @@ void p2m_teardown(struct p2m_domain *p2m) - - p2m_lock(p2m); - ASSERT(atomic_read(&d->shr_pages) == 0); -- p2m->phys_table = pagetable_null(); -+ -+ if ( remove_root ) -+ p2m->phys_table = pagetable_null(); -+ else if ( !pagetable_is_null(p2m->phys_table) ) -+ { -+ root_pg = pagetable_get_page(p2m->phys_table); -+ clear_domain_page(pagetable_get_mfn(p2m->phys_table)); -+ } - - while ( (pg = page_list_remove_head(&p2m->pages)) ) -- d->arch.paging.free_page(d, pg); -+ if ( pg != root_pg ) -+ d->arch.paging.free_page(d, pg); -+ -+ if ( root_pg ) -+ page_list_add(root_pg, &p2m->pages); -+ - p2m_unlock(p2m); - } - -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 8c1b041f7135..8c5baba9544d 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2701,7 +2701,7 @@ int shadow_enable(struct domain *d, u32 mode) - paging_unlock(d); - out_unlocked: - if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) -- p2m_teardown(p2m); -+ p2m_teardown(p2m, true); - if ( rv != 0 && pg != NULL ) - { - pg->count_info &= ~PGC_count_mask; -@@ -2866,7 +2866,7 @@ void shadow_final_teardown(struct domain *d) - shadow_teardown(d, NULL); - - /* It is now safe to pull down the p2m map. */ -- p2m_teardown(p2m_get_hostp2m(d)); -+ p2m_teardown(p2m_get_hostp2m(d), true); - /* Free any shadow memory that the p2m teardown released */ - paging_lock(d); - shadow_set_allocation(d, 0, NULL); -diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h -index f2af7a746ced..c3c16748e7d5 100644 ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -574,7 +574,7 @@ int p2m_init(struct domain *d); - int p2m_alloc_table(struct p2m_domain *p2m); - - /* Return all the p2m resources to Xen. 
*/ --void p2m_teardown(struct p2m_domain *p2m); -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root); - void p2m_final_teardown(struct domain *d); - - /* Add a page to a domain's p2m table */ --- -2.37.4 - diff --git a/0005-tools-Fix-build-with-recent-QEMU-use-enable-trace-ba.patch b/0005-tools-Fix-build-with-recent-QEMU-use-enable-trace-ba.patch new file mode 100644 index 0000000..ff66a43 --- /dev/null +++ b/0005-tools-Fix-build-with-recent-QEMU-use-enable-trace-ba.patch @@ -0,0 +1,50 @@ +From 998c03b2abfbf17ff96bccad1512de1ea18d0d75 Mon Sep 17 00:00:00 2001 +From: Anthony PERARD <anthony.perard@citrix.com> +Date: Tue, 7 Feb 2023 17:03:51 +0100 +Subject: [PATCH 05/61] tools: Fix build with recent QEMU, use + "--enable-trace-backends" + +The configure option "--enable-trace-backend" isn't accepted anymore +and we should use "--enable-trace-backends" instead which was +introduce in 2014 and allow multiple backends. + +"--enable-trace-backends" was introduced by: + 5b808275f3bb ("trace: Multi-backend tracing") +The backward compatible option "--enable-trace-backend" is removed by + 10229ec3b0ff ("configure: remove backwards-compatibility and obsolete options") + +As we already use ./configure options that wouldn't be accepted by +older version of QEMU's configure, we will simply use the new spelling +for the option and avoid trying to detect which spelling to use. + +We already make use if "--firmwarepath=" which was introduced by + 3d5eecab4a5a ("Add --firmwarepath to configure") +which already include the new spelling for "--enable-trace-backends". 
+ +Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> +Reviewed-by: Jason Andryuk <jandryuk@gmail.com> +master commit: e66d450b6e0ffec635639df993ab43ce28b3383f +master date: 2023-01-11 10:45:29 +0100 +--- + tools/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/Makefile b/tools/Makefile +index 757a560be0..9b6b605ec9 100644 +--- a/tools/Makefile ++++ b/tools/Makefile +@@ -218,9 +218,9 @@ subdir-all-qemu-xen-dir: qemu-xen-dir-find + mkdir -p qemu-xen-build; \ + cd qemu-xen-build; \ + if $$source/scripts/tracetool.py --check-backend --backend log ; then \ +- enable_trace_backend='--enable-trace-backend=log'; \ ++ enable_trace_backend="--enable-trace-backends=log"; \ + elif $$source/scripts/tracetool.py --check-backend --backend stderr ; then \ +- enable_trace_backend='--enable-trace-backend=stderr'; \ ++ enable_trace_backend='--enable-trace-backends=stderr'; \ + else \ + enable_trace_backend='' ; \ + fi ; \ +-- +2.40.0 + diff --git a/0005-x86-HAP-adjust-monitor-table-related-error-handling.patch b/0005-x86-HAP-adjust-monitor-table-related-error-handling.patch deleted file mode 100644 index 0f48084..0000000 --- a/0005-x86-HAP-adjust-monitor-table-related-error-handling.patch +++ /dev/null @@ -1,77 +0,0 @@ -From 3422c19d85a3d23a9d798eafb739ffb8865522d2 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 14:52:59 +0200 -Subject: [PATCH 05/87] x86/HAP: adjust monitor table related error handling -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -hap_make_monitor_table() will return INVALID_MFN if it encounters an -error condition, but hap_update_paging_modes() wasn’t handling this -value, resulting in an inappropriate value being stored in -monitor_table. This would subsequently misguide at least -hap_vcpu_teardown(). Avoid this by bailing early. 
- -Further, when a domain has/was already crashed or (perhaps less -important as there's no such path known to lead here) is already dying, -avoid calling domain_crash() on it again - that's at best confusing. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: 5b44a61180f4f2e4f490a28400c884dd357ff45d -master date: 2022-10-11 14:21:56 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index a8f5a19da917..d75dc2b9ed3d 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -39,6 +39,7 @@ - #include <asm/domain.h> - #include <xen/numa.h> - #include <asm/hvm/nestedhvm.h> -+#include <public/sched.h> - - #include "private.h" - -@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(struct vcpu *v) - return m4mfn; - - oom: -- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n"); -- domain_crash(d); -+ if ( !d->is_dying && -+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) -+ { -+ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n", -+ d); -+ domain_crash(d); -+ } - return INVALID_MFN; - } - -@@ -766,6 +772,9 @@ static void hap_update_paging_modes(struct vcpu *v) - if ( pagetable_is_null(v->arch.hvm.monitor_table) ) - { - mfn_t mmfn = hap_make_monitor_table(v); -+ -+ if ( mfn_eq(mmfn, INVALID_MFN) ) -+ goto unlock; - v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); - make_cr3(v, mmfn); - hvm_update_host_cr3(v); -@@ -774,6 +783,7 @@ static void hap_update_paging_modes(struct vcpu *v) - /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. 
*/ - hap_update_cr3(v, 0, false); - -+ unlock: - paging_unlock(d); - put_gfn(d, cr3_gfn); - } --- -2.37.4 - diff --git a/0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch b/0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch deleted file mode 100644 index b9439ca..0000000 --- a/0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch +++ /dev/null @@ -1,76 +0,0 @@ -From 40e9daf6b56ae49bda3ba4e254ccf0e998e52a8c Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 14:53:12 +0200 -Subject: [PATCH 06/87] x86/shadow: tolerate failure of - sh_set_toplevel_shadow() - -Subsequently sh_set_toplevel_shadow() will be adjusted to install a -blank entry in case prealloc fails. There are, in fact, pre-existing -error paths which would put in place a blank entry. The 4- and 2-level -code in sh_update_cr3(), however, assume the top level entry to be -valid. - -Hence bail from the function in the unlikely event that it's not. Note -that 3-level logic works differently: In particular a guest is free to -supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid) -entries. The guest will crash, but we already cope with that. - -Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(), -and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change -in security context, but add a respective assertion. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: eac000978c1feb5a9ee3236ab0c0da9a477e5336 -master date: 2022-10-11 14:22:24 +0200 ---- - xen/arch/x86/mm/shadow/common.c | 1 + - xen/arch/x86/mm/shadow/multi.c | 10 ++++++++++ - 2 files changed, 11 insertions(+) - -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 8c5baba9544d..00e520cbd05b 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2516,6 +2516,7 @@ void sh_set_toplevel_shadow(struct vcpu *v, - /* Now figure out the new contents: is this a valid guest MFN? */ - if ( !mfn_valid(gmfn) ) - { -+ ASSERT(mfn_eq(gmfn, INVALID_MFN)); - new_entry = pagetable_null(); - goto install_new_entry; - } -diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c -index 7b8f4dd13b03..2ff78fe3362c 100644 ---- a/xen/arch/x86/mm/shadow/multi.c -+++ b/xen/arch/x86/mm/shadow/multi.c -@@ -3312,6 +3312,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) - if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); - sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow); -+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) -+ { -+ ASSERT(d->is_dying || d->is_shutting_down); -+ return; -+ } - if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) ) - { - mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]); -@@ -3370,6 +3375,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) - if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); - sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow); -+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) -+ { -+ ASSERT(d->is_dying || d->is_shutting_down); -+ return; -+ } - #else - #error This should never 
happen - #endif --- -2.37.4 - diff --git a/0006-x86-vmx-Calculate-model-specific-LBRs-once-at-start-.patch b/0006-x86-vmx-Calculate-model-specific-LBRs-once-at-start-.patch new file mode 100644 index 0000000..c010110 --- /dev/null +++ b/0006-x86-vmx-Calculate-model-specific-LBRs-once-at-start-.patch @@ -0,0 +1,342 @@ +From 401e9e33a04c2a9887636ef58490c764543f0538 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 7 Feb 2023 17:04:18 +0100 +Subject: [PATCH 06/61] x86/vmx: Calculate model-specific LBRs once at start of + day + +There is no point repeating this calculation at runtime, especially as it is +in the fallback path of the WRSMR/RDMSR handlers. + +Move the infrastructure higher in vmx.c to avoid forward declarations, +renaming last_branch_msr_get() to get_model_specific_lbr() to highlight that +these are model-specific only. + +No practical change. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: e94af0d58f86c3a914b9cbbf4d9ed3d43b974771 +master date: 2023-01-12 18:42:00 +0000 +--- + xen/arch/x86/hvm/vmx/vmx.c | 276 +++++++++++++++++++------------------ + 1 file changed, 139 insertions(+), 137 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 3f42765313..bc308d9df2 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -394,6 +394,142 @@ void vmx_pi_hooks_deassign(struct domain *d) + domain_unpause(d); + } + ++static const struct lbr_info { ++ u32 base, count; ++} p4_lbr[] = { ++ { MSR_P4_LER_FROM_LIP, 1 }, ++ { MSR_P4_LER_TO_LIP, 1 }, ++ { MSR_P4_LASTBRANCH_TOS, 1 }, ++ { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, ++ { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, ++ { 0, 0 } ++}, c2_lbr[] = { ++ { MSR_IA32_LASTINTFROMIP, 1 }, ++ { MSR_IA32_LASTINTTOIP, 1 }, ++ { MSR_C2_LASTBRANCH_TOS, 1 }, ++ { 
MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO }, ++ { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO }, ++ { 0, 0 } ++}, nh_lbr[] = { ++ { MSR_IA32_LASTINTFROMIP, 1 }, ++ { MSR_IA32_LASTINTTOIP, 1 }, ++ { MSR_NHL_LBR_SELECT, 1 }, ++ { MSR_NHL_LASTBRANCH_TOS, 1 }, ++ { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, ++ { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, ++ { 0, 0 } ++}, sk_lbr[] = { ++ { MSR_IA32_LASTINTFROMIP, 1 }, ++ { MSR_IA32_LASTINTTOIP, 1 }, ++ { MSR_NHL_LBR_SELECT, 1 }, ++ { MSR_NHL_LASTBRANCH_TOS, 1 }, ++ { MSR_SKL_LASTBRANCH_0_FROM_IP, NUM_MSR_SKL_LASTBRANCH }, ++ { MSR_SKL_LASTBRANCH_0_TO_IP, NUM_MSR_SKL_LASTBRANCH }, ++ { MSR_SKL_LASTBRANCH_0_INFO, NUM_MSR_SKL_LASTBRANCH }, ++ { 0, 0 } ++}, at_lbr[] = { ++ { MSR_IA32_LASTINTFROMIP, 1 }, ++ { MSR_IA32_LASTINTTOIP, 1 }, ++ { MSR_C2_LASTBRANCH_TOS, 1 }, ++ { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, ++ { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, ++ { 0, 0 } ++}, sm_lbr[] = { ++ { MSR_IA32_LASTINTFROMIP, 1 }, ++ { MSR_IA32_LASTINTTOIP, 1 }, ++ { MSR_SM_LBR_SELECT, 1 }, ++ { MSR_SM_LASTBRANCH_TOS, 1 }, ++ { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, ++ { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, ++ { 0, 0 } ++}, gm_lbr[] = { ++ { MSR_IA32_LASTINTFROMIP, 1 }, ++ { MSR_IA32_LASTINTTOIP, 1 }, ++ { MSR_SM_LBR_SELECT, 1 }, ++ { MSR_SM_LASTBRANCH_TOS, 1 }, ++ { MSR_GM_LASTBRANCH_0_FROM_IP, NUM_MSR_GM_LASTBRANCH_FROM_TO }, ++ { MSR_GM_LASTBRANCH_0_TO_IP, NUM_MSR_GM_LASTBRANCH_FROM_TO }, ++ { 0, 0 } ++}; ++static const struct lbr_info *__read_mostly model_specific_lbr; ++ ++static const struct lbr_info *__init get_model_specific_lbr(void) ++{ ++ switch ( boot_cpu_data.x86 ) ++ { ++ case 6: ++ switch ( boot_cpu_data.x86_model ) ++ { ++ /* Core2 Duo */ ++ case 0x0f: ++ /* Enhanced Core */ ++ case 0x17: ++ /* Xeon 7400 */ ++ case 0x1d: ++ return c2_lbr; ++ /* Nehalem */ ++ 
case 0x1a: case 0x1e: case 0x1f: case 0x2e: ++ /* Westmere */ ++ case 0x25: case 0x2c: case 0x2f: ++ /* Sandy Bridge */ ++ case 0x2a: case 0x2d: ++ /* Ivy Bridge */ ++ case 0x3a: case 0x3e: ++ /* Haswell */ ++ case 0x3c: case 0x3f: case 0x45: case 0x46: ++ /* Broadwell */ ++ case 0x3d: case 0x47: case 0x4f: case 0x56: ++ return nh_lbr; ++ /* Skylake */ ++ case 0x4e: case 0x5e: ++ /* Xeon Scalable */ ++ case 0x55: ++ /* Cannon Lake */ ++ case 0x66: ++ /* Goldmont Plus */ ++ case 0x7a: ++ /* Ice Lake */ ++ case 0x6a: case 0x6c: case 0x7d: case 0x7e: ++ /* Tiger Lake */ ++ case 0x8c: case 0x8d: ++ /* Tremont */ ++ case 0x86: ++ /* Kaby Lake */ ++ case 0x8e: case 0x9e: ++ /* Comet Lake */ ++ case 0xa5: case 0xa6: ++ return sk_lbr; ++ /* Atom */ ++ case 0x1c: case 0x26: case 0x27: case 0x35: case 0x36: ++ return at_lbr; ++ /* Silvermont */ ++ case 0x37: case 0x4a: case 0x4d: case 0x5a: case 0x5d: ++ /* Xeon Phi Knights Landing */ ++ case 0x57: ++ /* Xeon Phi Knights Mill */ ++ case 0x85: ++ /* Airmont */ ++ case 0x4c: ++ return sm_lbr; ++ /* Goldmont */ ++ case 0x5c: case 0x5f: ++ return gm_lbr; ++ } ++ break; ++ ++ case 15: ++ switch ( boot_cpu_data.x86_model ) ++ { ++ /* Pentium4/Xeon with em64t */ ++ case 3: case 4: case 6: ++ return p4_lbr; ++ } ++ break; ++ } ++ ++ return NULL; ++} ++ + static int vmx_domain_initialise(struct domain *d) + { + static const struct arch_csw csw = { +@@ -2812,6 +2948,7 @@ const struct hvm_function_table * __init start_vmx(void) + vmx_function_table.get_guest_bndcfgs = vmx_get_guest_bndcfgs; + } + ++ model_specific_lbr = get_model_specific_lbr(); + lbr_tsx_fixup_check(); + ler_to_fixup_check(); + +@@ -2958,141 +3095,6 @@ static int vmx_cr_access(cr_access_qual_t qual) + return X86EMUL_OKAY; + } + +-static const struct lbr_info { +- u32 base, count; +-} p4_lbr[] = { +- { MSR_P4_LER_FROM_LIP, 1 }, +- { MSR_P4_LER_TO_LIP, 1 }, +- { MSR_P4_LASTBRANCH_TOS, 1 }, +- { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, +- { 
MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, +- { 0, 0 } +-}, c2_lbr[] = { +- { MSR_IA32_LASTINTFROMIP, 1 }, +- { MSR_IA32_LASTINTTOIP, 1 }, +- { MSR_C2_LASTBRANCH_TOS, 1 }, +- { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO }, +- { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO }, +- { 0, 0 } +-}, nh_lbr[] = { +- { MSR_IA32_LASTINTFROMIP, 1 }, +- { MSR_IA32_LASTINTTOIP, 1 }, +- { MSR_NHL_LBR_SELECT, 1 }, +- { MSR_NHL_LASTBRANCH_TOS, 1 }, +- { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, +- { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, +- { 0, 0 } +-}, sk_lbr[] = { +- { MSR_IA32_LASTINTFROMIP, 1 }, +- { MSR_IA32_LASTINTTOIP, 1 }, +- { MSR_NHL_LBR_SELECT, 1 }, +- { MSR_NHL_LASTBRANCH_TOS, 1 }, +- { MSR_SKL_LASTBRANCH_0_FROM_IP, NUM_MSR_SKL_LASTBRANCH }, +- { MSR_SKL_LASTBRANCH_0_TO_IP, NUM_MSR_SKL_LASTBRANCH }, +- { MSR_SKL_LASTBRANCH_0_INFO, NUM_MSR_SKL_LASTBRANCH }, +- { 0, 0 } +-}, at_lbr[] = { +- { MSR_IA32_LASTINTFROMIP, 1 }, +- { MSR_IA32_LASTINTTOIP, 1 }, +- { MSR_C2_LASTBRANCH_TOS, 1 }, +- { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, +- { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, +- { 0, 0 } +-}, sm_lbr[] = { +- { MSR_IA32_LASTINTFROMIP, 1 }, +- { MSR_IA32_LASTINTTOIP, 1 }, +- { MSR_SM_LBR_SELECT, 1 }, +- { MSR_SM_LASTBRANCH_TOS, 1 }, +- { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, +- { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, +- { 0, 0 } +-}, gm_lbr[] = { +- { MSR_IA32_LASTINTFROMIP, 1 }, +- { MSR_IA32_LASTINTTOIP, 1 }, +- { MSR_SM_LBR_SELECT, 1 }, +- { MSR_SM_LASTBRANCH_TOS, 1 }, +- { MSR_GM_LASTBRANCH_0_FROM_IP, NUM_MSR_GM_LASTBRANCH_FROM_TO }, +- { MSR_GM_LASTBRANCH_0_TO_IP, NUM_MSR_GM_LASTBRANCH_FROM_TO }, +- { 0, 0 } +-}; +- +-static const struct lbr_info *last_branch_msr_get(void) +-{ +- switch ( boot_cpu_data.x86 ) +- { +- case 6: +- switch ( boot_cpu_data.x86_model ) +- { +- /* Core2 Duo */ +- 
case 0x0f: +- /* Enhanced Core */ +- case 0x17: +- /* Xeon 7400 */ +- case 0x1d: +- return c2_lbr; +- /* Nehalem */ +- case 0x1a: case 0x1e: case 0x1f: case 0x2e: +- /* Westmere */ +- case 0x25: case 0x2c: case 0x2f: +- /* Sandy Bridge */ +- case 0x2a: case 0x2d: +- /* Ivy Bridge */ +- case 0x3a: case 0x3e: +- /* Haswell */ +- case 0x3c: case 0x3f: case 0x45: case 0x46: +- /* Broadwell */ +- case 0x3d: case 0x47: case 0x4f: case 0x56: +- return nh_lbr; +- /* Skylake */ +- case 0x4e: case 0x5e: +- /* Xeon Scalable */ +- case 0x55: +- /* Cannon Lake */ +- case 0x66: +- /* Goldmont Plus */ +- case 0x7a: +- /* Ice Lake */ +- case 0x6a: case 0x6c: case 0x7d: case 0x7e: +- /* Tiger Lake */ +- case 0x8c: case 0x8d: +- /* Tremont */ +- case 0x86: +- /* Kaby Lake */ +- case 0x8e: case 0x9e: +- /* Comet Lake */ +- case 0xa5: case 0xa6: +- return sk_lbr; +- /* Atom */ +- case 0x1c: case 0x26: case 0x27: case 0x35: case 0x36: +- return at_lbr; +- /* Silvermont */ +- case 0x37: case 0x4a: case 0x4d: case 0x5a: case 0x5d: +- /* Xeon Phi Knights Landing */ +- case 0x57: +- /* Xeon Phi Knights Mill */ +- case 0x85: +- /* Airmont */ +- case 0x4c: +- return sm_lbr; +- /* Goldmont */ +- case 0x5c: case 0x5f: +- return gm_lbr; +- } +- break; +- +- case 15: +- switch ( boot_cpu_data.x86_model ) +- { +- /* Pentium4/Xeon with em64t */ +- case 3: case 4: case 6: +- return p4_lbr; +- } +- break; +- } +- +- return NULL; +-} +- + enum + { + LBR_FORMAT_32 = 0x0, /* 32-bit record format */ +@@ -3199,7 +3201,7 @@ static void __init ler_to_fixup_check(void) + + static int is_last_branch_msr(u32 ecx) + { +- const struct lbr_info *lbr = last_branch_msr_get(); ++ const struct lbr_info *lbr = model_specific_lbr; + + if ( lbr == NULL ) + return 0; +@@ -3536,7 +3538,7 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + if ( !(v->arch.hvm.vmx.lbr_flags & LBR_MSRS_INSERTED) && + (msr_content & IA32_DEBUGCTLMSR_LBR) ) + { +- const struct lbr_info *lbr = last_branch_msr_get(); 
++ const struct lbr_info *lbr = model_specific_lbr; + + if ( unlikely(!lbr) ) + { +-- +2.40.0 + diff --git a/0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch b/0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch deleted file mode 100644 index d288a0b..0000000 --- a/0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch +++ /dev/null @@ -1,279 +0,0 @@ -From 28d3f677ec97c98154311f64871ac48762cf980a Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 14:53:27 +0200 -Subject: [PATCH 07/87] x86/shadow: tolerate failure in shadow_prealloc() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Prevent _shadow_prealloc() from calling BUG() when unable to fulfill -the pre-allocation and instead return true/false. Modify -shadow_prealloc() to crash the domain on allocation failure (if the -domain is not already dying), as shadow cannot operate normally after -that. Modify callers to also gracefully handle {_,}shadow_prealloc() -failing to fulfill the request. - -Note this in turn requires adjusting the callers of -sh_make_monitor_table() also to handle it returning INVALID_MFN. -sh_update_paging_modes() is also modified to add additional error -paths in case of allocation failure, some of those will return with -null monitor page tables (and the domain likely crashed). This is no -different that current error paths, but the newly introduced ones are -more likely to trigger. - -The now added failure points in sh_update_paging_modes() also require -that on some error return paths the previous structures are cleared, -and thus monitor table is null. - -While there adjust the 'type' parameter type of shadow_prealloc() to -unsigned int rather than u32. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: b7f93c6afb12b6061e2d19de2f39ea09b569ac68 -master date: 2022-10-11 14:22:53 +0200 ---- - xen/arch/x86/mm/shadow/common.c | 69 ++++++++++++++++++++++++-------- - xen/arch/x86/mm/shadow/hvm.c | 4 +- - xen/arch/x86/mm/shadow/multi.c | 11 +++-- - xen/arch/x86/mm/shadow/private.h | 3 +- - 4 files changed, 66 insertions(+), 21 deletions(-) - -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 00e520cbd05b..2067c7d16bb4 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -36,6 +36,7 @@ - #include <asm/flushtlb.h> - #include <asm/shadow.h> - #include <xen/numa.h> -+#include <public/sched.h> - #include "private.h" - - DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags); -@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn) - - /* Make sure there are at least count order-sized pages - * available in the shadow page pool. */ --static void _shadow_prealloc(struct domain *d, unsigned int pages) -+static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - { - struct vcpu *v; - struct page_info *sp, *t; - mfn_t smfn; - int i; - -- if ( d->arch.paging.shadow.free_pages >= pages ) return; -+ if ( d->arch.paging.shadow.free_pages >= pages ) -+ return true; - - /* Shouldn't have enabled shadows if we've no vcpus. 
*/ - ASSERT(d->vcpu && d->vcpu[0]); -@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) - sh_unpin(d, smfn); - - /* See if that freed up enough space */ -- if ( d->arch.paging.shadow.free_pages >= pages ) return; -+ if ( d->arch.paging.shadow.free_pages >= pages ) -+ return true; - } - - /* Stage two: all shadow pages are in use in hierarchies that are -@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) - if ( d->arch.paging.shadow.free_pages >= pages ) - { - guest_flush_tlb_mask(d, d->dirty_cpumask); -- return; -+ return true; - } - } - } -@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) - d->arch.paging.shadow.total_pages, - d->arch.paging.shadow.free_pages, - d->arch.paging.shadow.p2m_pages); -- BUG(); -+ -+ ASSERT(d->is_dying); -+ -+ guest_flush_tlb_mask(d, d->dirty_cpumask); -+ -+ return false; - } - - /* Make sure there are at least count pages of the order according to -@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) - * This must be called before any calls to shadow_alloc(). Since this - * will free existing shadows to make room, it must be called early enough - * to avoid freeing shadows that the caller is currently working on. */ --void shadow_prealloc(struct domain *d, u32 type, unsigned int count) -+bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) - { -- return _shadow_prealloc(d, shadow_size(type) * count); -+ bool ret = _shadow_prealloc(d, shadow_size(type) * count); -+ -+ if ( !ret && !d->is_dying && -+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) -+ /* -+ * Failing to allocate memory required for shadow usage can only result in -+ * a domain crash, do it here rather that relying on every caller to do it. 
-+ */ -+ domain_crash(d); -+ -+ return ret; - } - - /* Deliberately free all the memory we can: this will tear down all of -@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t smfn) - static struct page_info * - shadow_alloc_p2m_page(struct domain *d) - { -- struct page_info *pg; -+ struct page_info *pg = NULL; - - /* This is called both from the p2m code (which never holds the - * paging lock) and the log-dirty code (which always does). */ -@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d) - d->arch.paging.shadow.p2m_pages, - shadow_min_acceptable_pages(d)); - } -- paging_unlock(d); -- return NULL; -+ goto out; - } - -- shadow_prealloc(d, SH_type_p2m_table, 1); -+ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) ) -+ goto out; -+ - pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); - d->arch.paging.shadow.p2m_pages++; - d->arch.paging.shadow.total_pages--; - ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); - -+ out: - paging_unlock(d); - - return pg; -@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain *d, unsigned int pages, bool *preempted) - else if ( d->arch.paging.shadow.total_pages > pages ) - { - /* Need to return memory to domheap */ -- _shadow_prealloc(d, 1); -+ if ( !_shadow_prealloc(d, 1) ) -+ return -ENOMEM; -+ - sp = page_list_remove_head(&d->arch.paging.shadow.freelist); - ASSERT(sp); - /* -@@ -2334,12 +2356,13 @@ static void sh_update_paging_modes(struct vcpu *v) - if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) ) - { - int i; -+ -+ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) ) -+ return; -+ - for(i = 0; i < SHADOW_OOS_PAGES; i++) -- { -- shadow_prealloc(d, SH_type_oos_snapshot, 1); - v->arch.paging.shadow.oos_snapshot[i] = - shadow_alloc(d, SH_type_oos_snapshot, 0); -- } - } - #endif /* OOS */ - -@@ -2403,6 +2426,9 @@ static void sh_update_paging_modes(struct vcpu *v) - mfn_t mmfn = sh_make_monitor_table( - v, v->arch.paging.mode->shadow.shadow_levels); - 
-+ if ( mfn_eq(mmfn, INVALID_MFN) ) -+ return; -+ - v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); - make_cr3(v, mmfn); - hvm_update_host_cr3(v); -@@ -2441,6 +2467,12 @@ static void sh_update_paging_modes(struct vcpu *v) - v->arch.hvm.monitor_table = pagetable_null(); - new_mfn = sh_make_monitor_table( - v, v->arch.paging.mode->shadow.shadow_levels); -+ if ( mfn_eq(new_mfn, INVALID_MFN) ) -+ { -+ sh_destroy_monitor_table(v, old_mfn, -+ old_mode->shadow.shadow_levels); -+ return; -+ } - v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn); - SHADOW_PRINTK("new monitor table %"PRI_mfn "\n", - mfn_x(new_mfn)); -@@ -2526,7 +2558,12 @@ void sh_set_toplevel_shadow(struct vcpu *v, - if ( !mfn_valid(smfn) ) - { - /* Make sure there's enough free shadow memory. */ -- shadow_prealloc(d, root_type, 1); -+ if ( !shadow_prealloc(d, root_type, 1) ) -+ { -+ new_entry = pagetable_null(); -+ goto install_new_entry; -+ } -+ - /* Shadow the page. */ - smfn = make_shadow(v, gmfn, root_type); - } -diff --git a/xen/arch/x86/mm/shadow/hvm.c b/xen/arch/x86/mm/shadow/hvm.c -index d5f42102a0bd..a0878d9ad71a 100644 ---- a/xen/arch/x86/mm/shadow/hvm.c -+++ b/xen/arch/x86/mm/shadow/hvm.c -@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct vcpu *v, unsigned int shadow_levels) - ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table)); - - /* Guarantee we can get the memory we need */ -- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS); -+ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) ) -+ return INVALID_MFN; -+ - m4mfn = shadow_alloc(d, SH_type_monitor_table, 0); - mfn_to_page(m4mfn)->shadow_flags = 4; - -diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c -index 2ff78fe3362c..c07af0bd99da 100644 ---- a/xen/arch/x86/mm/shadow/multi.c -+++ b/xen/arch/x86/mm/shadow/multi.c -@@ -2440,9 +2440,14 @@ static int sh_page_fault(struct vcpu *v, - * Preallocate shadow pages *before* removing writable accesses - * otherwhise 
an OOS L1 might be demoted and promoted again with - * writable mappings. */ -- shadow_prealloc(d, -- SH_type_l1_shadow, -- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1); -+ if ( !shadow_prealloc(d, SH_type_l1_shadow, -+ GUEST_PAGING_LEVELS < 4 -+ ? 1 : GUEST_PAGING_LEVELS - 1) ) -+ { -+ paging_unlock(d); -+ put_gfn(d, gfn_x(gfn)); -+ return 0; -+ } - - rc = gw_remove_write_accesses(v, va, &gw); - -diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h -index 35efb1b984fb..738214f75e8d 100644 ---- a/xen/arch/x86/mm/shadow/private.h -+++ b/xen/arch/x86/mm/shadow/private.h -@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mfn_t gmfn, u32 type); - void shadow_demote(struct domain *d, mfn_t gmfn, u32 type); - - /* Shadow page allocation functions */ --void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count); -+bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type, -+ unsigned int count); - mfn_t shadow_alloc(struct domain *d, - u32 shadow_type, - unsigned long backpointer); --- -2.37.4 - diff --git a/0007-x86-vmx-Support-for-CPUs-without-model-specific-LBR.patch b/0007-x86-vmx-Support-for-CPUs-without-model-specific-LBR.patch new file mode 100644 index 0000000..fc81a17 --- /dev/null +++ b/0007-x86-vmx-Support-for-CPUs-without-model-specific-LBR.patch @@ -0,0 +1,83 @@ +From 9f425039ca50e8cc8db350ec54d8a7cd4175f417 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 7 Feb 2023 17:04:49 +0100 +Subject: [PATCH 07/61] x86/vmx: Support for CPUs without model-specific LBR + +Ice Lake (server at least) has both architectural LBR and model-specific LBR. +Sapphire Rapids does not have model-specific LBR at all. I.e. On SPR and +later, model_specific_lbr will always be NULL, so we must make changes to +avoid reliably hitting the domain_crash(). 
+ +The Arch LBR spec states that CPUs without model-specific LBR implement +MSR_DBG_CTL.LBR by discarding writes and always returning 0. + +Do this for any CPU for which we lack model-specific LBR information. + +Adjust the now-stale comment, now that the Arch LBR spec has created a way to +signal "no model specific LBR" to guests. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: 3edca52ce736297d7fcf293860cd94ef62638052 +master date: 2023-01-12 18:42:00 +0000 +--- + xen/arch/x86/hvm/vmx/vmx.c | 31 ++++++++++++++++--------------- + 1 file changed, 16 insertions(+), 15 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index bc308d9df2..094141be9a 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -3518,18 +3518,26 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + if ( msr_content & rsvd ) + goto gp_fault; + ++ /* ++ * The Arch LBR spec (new in Ice Lake) states that CPUs with no ++ * model-specific LBRs implement MSR_DBG_CTL.LBR by discarding writes ++ * and always returning 0. ++ * ++ * Use this property in all cases where we don't know any ++ * model-specific LBR information, as it matches real hardware ++ * behaviour on post-Ice Lake systems. ++ */ ++ if ( !model_specific_lbr ) ++ msr_content &= ~IA32_DEBUGCTLMSR_LBR; ++ + /* + * When a guest first enables LBR, arrange to save and restore the LBR + * MSRs and allow the guest direct access. + * +- * MSR_DEBUGCTL and LBR has existed almost as long as MSRs have +- * existed, and there is no architectural way to hide the feature, or +- * fail the attempt to enable LBR. 
+- * +- * Unknown host LBR MSRs or hitting -ENOSPC with the guest load/save +- * list are definitely hypervisor bugs, whereas -ENOMEM for allocating +- * the load/save list is simply unlucky (and shouldn't occur with +- * sensible management by the toolstack). ++ * Hitting -ENOSPC with the guest load/save list is definitely a ++ * hypervisor bug, whereas -ENOMEM for allocating the load/save list ++ * is simply unlucky (and shouldn't occur with sensible management by ++ * the toolstack). + * + * Either way, there is nothing we can do right now to recover, and + * the guest won't execute correctly either. Simply crash the domain +@@ -3540,13 +3548,6 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content) + { + const struct lbr_info *lbr = model_specific_lbr; + +- if ( unlikely(!lbr) ) +- { +- gprintk(XENLOG_ERR, "Unknown Host LBR MSRs\n"); +- domain_crash(v->domain); +- return X86EMUL_OKAY; +- } +- + for ( ; lbr->count; lbr++ ) + { + unsigned int i; +-- +2.40.0 + diff --git a/0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch b/0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch deleted file mode 100644 index d89d5b9..0000000 --- a/0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch +++ /dev/null @@ -1,100 +0,0 @@ -From 745e0b300dc3f5000e6d48c273b405d4bcc29ba7 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 14:53:41 +0200 -Subject: [PATCH 08/87] x86/p2m: refuse new allocations for dying domains -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This will in particular prevent any attempts to add entries to the p2m, -once - in a subsequent change - non-root entries have been removed. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: ff600a8cf8e36f8ecbffecf96a035952e022ab87 -master date: 2022-10-11 14:23:22 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 5 ++++- - xen/arch/x86/mm/shadow/common.c | 18 ++++++++++++++---- - 2 files changed, 18 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index d75dc2b9ed3d..787991233e53 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struct domain *d) - - ASSERT(paging_locked_by_me(d)); - -+ if ( unlikely(d->is_dying) ) -+ return NULL; -+ - pg = page_list_remove_head(&d->arch.paging.hap.freelist); - if ( unlikely(!pg) ) - return NULL; -@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_page(struct domain *d) - d->arch.paging.hap.p2m_pages++; - ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); - } -- else if ( !d->arch.paging.p2m_alloc_failed ) -+ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying ) - { - d->arch.paging.p2m_alloc_failed = 1; - dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n", -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 2067c7d16bb4..9807f6ec6c00 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -939,6 +939,10 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - if ( d->arch.paging.shadow.free_pages >= pages ) - return true; - -+ if ( unlikely(d->is_dying) ) -+ /* No reclaim when the domain is dying, teardown will take care of it. */ -+ return false; -+ - /* Shouldn't have enabled shadows if we've no vcpus. 
*/ - ASSERT(d->vcpu && d->vcpu[0]); - -@@ -991,7 +995,7 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - d->arch.paging.shadow.free_pages, - d->arch.paging.shadow.p2m_pages); - -- ASSERT(d->is_dying); -+ ASSERT_UNREACHABLE(); - - guest_flush_tlb_mask(d, d->dirty_cpumask); - -@@ -1005,10 +1009,13 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - * to avoid freeing shadows that the caller is currently working on. */ - bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) - { -- bool ret = _shadow_prealloc(d, shadow_size(type) * count); -+ bool ret; - -- if ( !ret && !d->is_dying && -- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) -+ if ( unlikely(d->is_dying) ) -+ return false; -+ -+ ret = _shadow_prealloc(d, shadow_size(type) * count); -+ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) - /* - * Failing to allocate memory required for shadow usage can only result in - * a domain crash, do it here rather that relying on every caller to do it. -@@ -1238,6 +1245,9 @@ shadow_alloc_p2m_page(struct domain *d) - { - struct page_info *pg = NULL; - -+ if ( unlikely(d->is_dying) ) -+ return NULL; -+ - /* This is called both from the p2m code (which never holds the - * paging lock) and the log-dirty code (which always does). 
*/ - paging_lock_recursive(d); --- -2.37.4 - diff --git a/0008-x86-shadow-fix-PAE-check-for-top-level-table-unshado.patch b/0008-x86-shadow-fix-PAE-check-for-top-level-table-unshado.patch new file mode 100644 index 0000000..ab7862b --- /dev/null +++ b/0008-x86-shadow-fix-PAE-check-for-top-level-table-unshado.patch @@ -0,0 +1,39 @@ +From 1550835b381a18fc0e972e5d04925e02fab31553 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 7 Feb 2023 17:05:22 +0100 +Subject: [PATCH 08/61] x86/shadow: fix PAE check for top-level table + unshadowing + +Clearly within the for_each_vcpu() the vCPU of this loop is meant, not +the (loop invariant) one the fault occurred on. + +Fixes: 3d5e6a3ff383 ("x86 hvm: implement HVMOP_pagetable_dying") +Fixes: ef3b0d8d2c39 ("x86/shadow: shadow_table[] needs only one entry for PV-only configs") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: f8fdceefbb1193ec81667eb40b83bc525cb71204 +master date: 2023-01-20 09:23:42 +0100 +--- + xen/arch/x86/mm/shadow/multi.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c +index c07af0bd99..f7acd18a36 100644 +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -2665,10 +2665,10 @@ static int sh_page_fault(struct vcpu *v, + #if GUEST_PAGING_LEVELS == 3 + unsigned int i; + +- for_each_shadow_table(v, i) ++ for_each_shadow_table(tmp, i) + { + mfn_t smfn = pagetable_get_mfn( +- v->arch.paging.shadow.shadow_table[i]); ++ tmp->arch.paging.shadow.shadow_table[i]); + + if ( mfn_valid(smfn) && (mfn_x(smfn) != 0) ) + { +-- +2.40.0 + diff --git a/0009-ns16550-fix-an-incorrect-assignment-to-uart-io_size.patch b/0009-ns16550-fix-an-incorrect-assignment-to-uart-io_size.patch new file mode 100644 index 0000000..83e46c7 --- /dev/null +++ b/0009-ns16550-fix-an-incorrect-assignment-to-uart-io_size.patch @@ -0,0 +1,34 @@ 
+From 0fd9ad2b9c0c9d9c4879a566f1788d3e9cd38ef6 Mon Sep 17 00:00:00 2001 +From: Ayan Kumar Halder <ayan.kumar.halder@amd.com> +Date: Tue, 7 Feb 2023 17:05:56 +0100 +Subject: [PATCH 09/61] ns16550: fix an incorrect assignment to uart->io_size + +uart->io_size represents the size in bytes. Thus, when serial_port.bit_width +is assigned to it, it should be converted to size in bytes. + +Fixes: 17b516196c ("ns16550: add ACPI support for ARM only") +Reported-by: Jan Beulich <jbeulich@suse.com> +Signed-off-by: Ayan Kumar Halder <ayan.kumar.halder@amd.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +master commit: 352c89f72ddb67b8d9d4e492203f8c77f85c8df1 +master date: 2023-01-24 16:54:38 +0100 +--- + xen/drivers/char/ns16550.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/drivers/char/ns16550.c b/xen/drivers/char/ns16550.c +index 2d2bd2a024..5dd4d723f5 100644 +--- a/xen/drivers/char/ns16550.c ++++ b/xen/drivers/char/ns16550.c +@@ -1780,7 +1780,7 @@ static int __init ns16550_acpi_uart_init(const void *data) + uart->parity = spcr->parity; + uart->stop_bits = spcr->stop_bits; + uart->io_base = spcr->serial_port.address; +- uart->io_size = spcr->serial_port.bit_width; ++ uart->io_size = DIV_ROUND_UP(spcr->serial_port.bit_width, BITS_PER_BYTE); + uart->reg_shift = spcr->serial_port.bit_offset; + uart->reg_width = spcr->serial_port.access_width; + +-- +2.40.0 + diff --git a/0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch b/0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch deleted file mode 100644 index 57620cd..0000000 --- a/0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch +++ /dev/null @@ -1,115 +0,0 @@ -From 943635d8f8486209e4e48966507ad57963e96284 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 14:54:00 +0200 -Subject: [PATCH 09/87] x86/p2m: truly free paging pool memory for dying - domains -MIME-Version: 1.0 
-Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Modify {hap,shadow}_free to free the page immediately if the domain is -dying, so that pages don't accumulate in the pool when -{shadow,hap}_final_teardown() get called. This is to limit the amount of -work which needs to be done there (in a non-preemptable manner). - -Note the call to shadow_free() in shadow_free_p2m_page() is moved after -increasing total_pages, so that the decrease done in shadow_free() in -case the domain is dying doesn't underflow the counter, even if just for -a short interval. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: f50a2c0e1d057c00d6061f40ae24d068226052ad -master date: 2022-10-11 14:23:51 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 12 ++++++++++++ - xen/arch/x86/mm/shadow/common.c | 28 +++++++++++++++++++++++++--- - 2 files changed, 37 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index 787991233e53..aef2297450e1 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, mfn_t mfn) - - ASSERT(paging_locked_by_me(d)); - -+ /* -+ * For dying domains, actually free the memory here. This way less work is -+ * left to hap_final_teardown(), which cannot easily have preemption checks -+ * added. 
-+ */ -+ if ( unlikely(d->is_dying) ) -+ { -+ free_domheap_page(pg); -+ d->arch.paging.hap.total_pages--; -+ return; -+ } -+ - d->arch.paging.hap.free_pages++; - page_list_add_tail(pg, &d->arch.paging.hap.freelist); - } -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 9807f6ec6c00..9eb33eafc7f7 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -1187,6 +1187,7 @@ mfn_t shadow_alloc(struct domain *d, - void shadow_free(struct domain *d, mfn_t smfn) - { - struct page_info *next = NULL, *sp = mfn_to_page(smfn); -+ bool dying = ACCESS_ONCE(d->is_dying); - struct page_list_head *pin_list; - unsigned int pages; - u32 shadow_type; -@@ -1229,11 +1230,32 @@ void shadow_free(struct domain *d, mfn_t smfn) - * just before the allocator hands the page out again. */ - page_set_tlbflush_timestamp(sp); - perfc_decr(shadow_alloc_count); -- page_list_add_tail(sp, &d->arch.paging.shadow.freelist); -+ -+ /* -+ * For dying domains, actually free the memory here. This way less -+ * work is left to shadow_final_teardown(), which cannot easily have -+ * preemption checks added. -+ */ -+ if ( unlikely(dying) ) -+ { -+ /* -+ * The backpointer field (sh.back) used by shadow code aliases the -+ * domain owner field, unconditionally clear it here to avoid -+ * free_domheap_page() attempting to parse it. -+ */ -+ page_set_owner(sp, NULL); -+ free_domheap_page(sp); -+ } -+ else -+ page_list_add_tail(sp, &d->arch.paging.shadow.freelist); -+ - sp = next; - } - -- d->arch.paging.shadow.free_pages += pages; -+ if ( unlikely(dying) ) -+ d->arch.paging.shadow.total_pages -= pages; -+ else -+ d->arch.paging.shadow.free_pages += pages; - } - - /* Divert a page from the pool to be used by the p2m mapping. -@@ -1303,9 +1325,9 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg) - * paging lock) and the log-dirty code (which always does). 
*/ - paging_lock_recursive(d); - -- shadow_free(d, page_to_mfn(pg)); - d->arch.paging.shadow.p2m_pages--; - d->arch.paging.shadow.total_pages++; -+ shadow_free(d, page_to_mfn(pg)); - - paging_unlock(d); - } --- -2.37.4 - diff --git a/0010-libxl-fix-guest-kexec-skip-cpuid-policy.patch b/0010-libxl-fix-guest-kexec-skip-cpuid-policy.patch new file mode 100644 index 0000000..6150286 --- /dev/null +++ b/0010-libxl-fix-guest-kexec-skip-cpuid-policy.patch @@ -0,0 +1,72 @@ +From 6e081438bf8ef616d0123aab7a743476d8114ef6 Mon Sep 17 00:00:00 2001 +From: Jason Andryuk <jandryuk@gmail.com> +Date: Tue, 7 Feb 2023 17:06:47 +0100 +Subject: [PATCH 10/61] libxl: fix guest kexec - skip cpuid policy + +When a domain performs a kexec (soft reset), libxl__build_pre() is +called with the existing domid. Calling libxl__cpuid_legacy() on the +existing domain fails since the cpuid policy has already been set, and +the guest isn't rebuilt and doesn't kexec. + +xc: error: Failed to set d1's policy (err leaf 0xffffffff, subleaf 0xffffffff, msr 0xffffffff) (17 = File exists): Internal error +libxl: error: libxl_cpuid.c:494:libxl__cpuid_legacy: Domain 1:Failed to apply CPUID policy: File exists +libxl: error: libxl_create.c:1641:domcreate_rebuild_done: Domain 1:cannot (re-)build domain: -3 +libxl: error: libxl_xshelp.c:201:libxl__xs_read_mandatory: xenstore read failed: `/libxl/1/type': No such file or directory +libxl: warning: libxl_dom.c:49:libxl__domain_type: unable to get domain type for domid=1, assuming HVM + +During a soft_reset, skip calling libxl__cpuid_legacy() to avoid the +issue. Before commit 34990446ca91, the libxl__cpuid_legacy() failure +would have been ignored, so kexec would continue. 
+ +Fixes: 34990446ca91 ("libxl: don't ignore the return value from xc_cpuid_apply_policy") +Signed-off-by: Jason Andryuk <jandryuk@gmail.com> +Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> +master commit: 1e454c2b5b1172e0fc7457e411ebaba61db8fc87 +master date: 2023-01-26 10:58:23 +0100 +--- + tools/libs/light/libxl_create.c | 2 ++ + tools/libs/light/libxl_dom.c | 2 +- + tools/libs/light/libxl_internal.h | 1 + + 3 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c +index 885675591f..2e6357a9d7 100644 +--- a/tools/libs/light/libxl_create.c ++++ b/tools/libs/light/libxl_create.c +@@ -2176,6 +2176,8 @@ static int do_domain_soft_reset(libxl_ctx *ctx, + aop_console_how); + cdcs->domid_out = &domid_out; + ++ state->soft_reset = true; ++ + dom_path = libxl__xs_get_dompath(gc, domid); + if (!dom_path) { + LOGD(ERROR, domid, "failed to read domain path"); +diff --git a/tools/libs/light/libxl_dom.c b/tools/libs/light/libxl_dom.c +index 73fccd9243..a2bd2395fa 100644 +--- a/tools/libs/light/libxl_dom.c ++++ b/tools/libs/light/libxl_dom.c +@@ -384,7 +384,7 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid, + /* Construct a CPUID policy, but only for brand new domains. Domains + * being migrated-in/restored have CPUID handled during the + * static_data_done() callback. */ +- if (!state->restore) ++ if (!state->restore && !state->soft_reset) + rc = libxl__cpuid_legacy(ctx, domid, false, info); + + out: +diff --git a/tools/libs/light/libxl_internal.h b/tools/libs/light/libxl_internal.h +index 0b4671318c..ee6a251700 100644 +--- a/tools/libs/light/libxl_internal.h ++++ b/tools/libs/light/libxl_internal.h +@@ -1407,6 +1407,7 @@ typedef struct { + /* Whether this domain is being migrated/restored, or booting fresh. Only + * applicable to the primary domain, not support domains (e.g. stub QEMU). 
*/ + bool restore; ++ bool soft_reset; + } libxl__domain_build_state; + + _hidden void libxl__domain_build_state_init(libxl__domain_build_state *s); +-- +2.40.0 + diff --git a/0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch b/0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch deleted file mode 100644 index 8c80e31..0000000 --- a/0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch +++ /dev/null @@ -1,181 +0,0 @@ -From f5959ed715e19cf2844656477dbf74c2f576c9d4 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 11 Oct 2022 14:54:21 +0200 -Subject: [PATCH 10/87] x86/p2m: free the paging memory pool preemptively -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The paging memory pool is currently freed in two different places: -from {shadow,hap}_teardown() via domain_relinquish_resources() and -from {shadow,hap}_final_teardown() via complete_domain_destroy(). -While the former does handle preemption, the later doesn't. - -Attempt to move as much p2m related freeing as possible to happen -before the call to {shadow,hap}_teardown(), so that most memory can be -freed in a preemptive way. In order to avoid causing issues to -existing callers leave the root p2m page tables set and free them in -{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free -the page immediately if the domain is dying, so that pages don't -accumulate in the pool when {shadow,hap}_final_teardown() get called. - -Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's -the place where altp2m_active gets disabled now. - -This is part of CVE-2022-33746 / XSA-410. 
- -Reported-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Tim Deegan <tim@xen.org> -master commit: e7aa55c0aab36d994bf627c92bd5386ae167e16e -master date: 2022-10-11 14:24:21 +0200 ---- - xen/arch/x86/domain.c | 7 ------ - xen/arch/x86/mm/hap/hap.c | 42 ++++++++++++++++++++------------- - xen/arch/x86/mm/shadow/common.c | 12 ++++++++++ - 3 files changed, 38 insertions(+), 23 deletions(-) - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 0d39981550ca..a4356893bdbc 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -38,7 +38,6 @@ - #include <xen/livepatch.h> - #include <public/sysctl.h> - #include <public/hvm/hvm_vcpu.h> --#include <asm/altp2m.h> - #include <asm/regs.h> - #include <asm/mc146818rtc.h> - #include <asm/system.h> -@@ -2381,12 +2380,6 @@ int domain_relinquish_resources(struct domain *d) - vpmu_destroy(v); - } - -- if ( altp2m_active(d) ) -- { -- for_each_vcpu ( d, v ) -- altp2m_vcpu_disable_ve(v); -- } -- - if ( is_pv_domain(d) ) - { - for_each_vcpu ( d, v ) -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index aef2297450e1..a44fcfd95e1e 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -28,6 +28,7 @@ - #include <xen/domain_page.h> - #include <xen/guest_access.h> - #include <xen/keyhandler.h> -+#include <asm/altp2m.h> - #include <asm/event.h> - #include <asm/page.h> - #include <asm/current.h> -@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d) - unsigned int i; - - if ( hvm_altp2m_supported() ) -- { -- d->arch.altp2m_active = 0; -- -- if ( d->arch.altp2m_eptp ) -- { -- free_xenheap_page(d->arch.altp2m_eptp); -- d->arch.altp2m_eptp = NULL; -- } -- -- if ( d->arch.altp2m_visible_eptp ) -- { -- free_xenheap_page(d->arch.altp2m_visible_eptp); -- d->arch.altp2m_visible_eptp = NULL; -- } -- - for ( i = 0; i < MAX_ALTP2M; i++ ) - p2m_teardown(d->arch.altp2m_p2m[i], 
true); -- } - - /* Destroy nestedp2m's first */ - for (i = 0; i < MAX_NESTEDP2M; i++) { -@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d) - paging_lock(d); - hap_set_allocation(d, 0, NULL); - ASSERT(d->arch.paging.hap.p2m_pages == 0); -+ ASSERT(d->arch.paging.hap.free_pages == 0); -+ ASSERT(d->arch.paging.hap.total_pages == 0); - paging_unlock(d); - } - -@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v) - void hap_teardown(struct domain *d, bool *preempted) - { - struct vcpu *v; -+ unsigned int i; - - ASSERT(d->is_dying); - ASSERT(d != current->domain); -@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool *preempted) - for_each_vcpu ( d, v ) - hap_vcpu_teardown(v); - -+ /* Leave the root pt in case we get further attempts to modify the p2m. */ -+ if ( hvm_altp2m_supported() ) -+ { -+ if ( altp2m_active(d) ) -+ for_each_vcpu ( d, v ) -+ altp2m_vcpu_disable_ve(v); -+ -+ d->arch.altp2m_active = 0; -+ -+ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp); -+ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); -+ -+ for ( i = 0; i < MAX_ALTP2M; i++ ) -+ p2m_teardown(d->arch.altp2m_p2m[i], false); -+ } -+ -+ /* Destroy nestedp2m's after altp2m. 
*/ -+ for ( i = 0; i < MAX_NESTEDP2M; i++ ) -+ p2m_teardown(d->arch.nested_p2m[i], false); -+ -+ p2m_teardown(p2m_get_hostp2m(d), false); -+ - paging_lock(d); /* Keep various asserts happy */ - - if ( d->arch.paging.hap.total_pages != 0 ) -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 9eb33eafc7f7..ac9a1ae07808 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2824,8 +2824,17 @@ void shadow_teardown(struct domain *d, bool *preempted) - for_each_vcpu ( d, v ) - shadow_vcpu_teardown(v); - -+ p2m_teardown(p2m_get_hostp2m(d), false); -+ - paging_lock(d); - -+ /* -+ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find -+ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages -+ * because the domain is dying. -+ */ -+ shadow_blow_tables(d); -+ - #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) - /* Free the virtual-TLB array attached to each vcpu */ - for_each_vcpu(d, v) -@@ -2946,6 +2955,9 @@ void shadow_final_teardown(struct domain *d) - d->arch.paging.shadow.total_pages, - d->arch.paging.shadow.free_pages, - d->arch.paging.shadow.p2m_pages); -+ ASSERT(!d->arch.paging.shadow.total_pages); -+ ASSERT(!d->arch.paging.shadow.free_pages); -+ ASSERT(!d->arch.paging.shadow.p2m_pages); - paging_unlock(d); - } - --- -2.37.4 - diff --git a/0011-tools-ocaml-xenctrl-Make-domain_getinfolist-tail-rec.patch b/0011-tools-ocaml-xenctrl-Make-domain_getinfolist-tail-rec.patch new file mode 100644 index 0000000..1d4455f --- /dev/null +++ b/0011-tools-ocaml-xenctrl-Make-domain_getinfolist-tail-rec.patch @@ -0,0 +1,71 @@ +From c6a3d14df051bae0323af539e34cf5a65fba1112 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Tue, 1 Nov 2022 17:59:16 +0000 +Subject: [PATCH 11/61] tools/ocaml/xenctrl: Make domain_getinfolist tail + recursive +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 
+Content-Transfer-Encoding: 8bit + +domain_getinfolist() is quadratic with the number of domains, because of the +behaviour of the underlying hypercall. xenopsd was further observed to be +wasting excessive quantities of time manipulating the list of already-obtained +domains. + +Implement a tail recursive `rev_concat` equivalent to `concat |> rev`, and use +it instead of calling `@` multiple times. + +An incidental benefit is that the list of domains will now be in domid order, +instead of having pairs of 2 domains changing direction every time. + +In a scalability testing scenario with ~1000 VMs, a combination of this and +the subsequent change takes xenopsd's wallclock time in domain_getinfolist() +down from 88% to 0.02% + +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Tested-by: Pau Ruiz Safont <pau.safont@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit c3b6be714c64aa62b56d0bce96f4b6a10b5c2078) +--- + tools/ocaml/libs/xc/xenctrl.ml | 23 +++++++++++++++++------ + 1 file changed, 17 insertions(+), 6 deletions(-) + +diff --git a/tools/ocaml/libs/xc/xenctrl.ml b/tools/ocaml/libs/xc/xenctrl.ml +index 7503031d8f..f10b686215 100644 +--- a/tools/ocaml/libs/xc/xenctrl.ml ++++ b/tools/ocaml/libs/xc/xenctrl.ml +@@ -212,14 +212,25 @@ external domain_shutdown: handle -> domid -> shutdown_reason -> unit + external _domain_getinfolist: handle -> domid -> int -> domaininfo list + = "stub_xc_domain_getinfolist" + ++let rev_append_fold acc e = List.rev_append e acc ++ ++(** ++ * [rev_concat lst] is equivalent to [lst |> List.concat |> List.rev] ++ * except it is tail recursive, whereas [List.concat] isn't. 
++ * Example: ++ * rev_concat [[10;9;8];[7;6];[5]]] = [5; 6; 7; 8; 9; 10] ++ *) ++let rev_concat lst = List.fold_left rev_append_fold [] lst ++ + let domain_getinfolist handle first_domain = + let nb = 2 in +- let last_domid l = (List.hd l).domid + 1 in +- let rec __getlist from = +- let l = _domain_getinfolist handle from nb in +- (if List.length l = nb then __getlist (last_domid l) else []) @ l +- in +- List.rev (__getlist first_domain) ++ let rec __getlist lst from = ++ (* _domain_getinfolist returns domains in reverse order, largest first *) ++ match _domain_getinfolist handle from nb with ++ | [] -> rev_concat lst ++ | (hd :: _) as l -> __getlist (l :: lst) (hd.domid + 1) ++ in ++ __getlist [] first_domain + + external domain_getinfo: handle -> domid -> domaininfo= "stub_xc_domain_getinfo" + +-- +2.40.0 + diff --git a/0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch b/0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch deleted file mode 100644 index 096656a..0000000 --- a/0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch +++ /dev/null @@ -1,197 +0,0 @@ -From a603386b422f5cb4c5e2639a7e20a1d99dba2175 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 11 Oct 2022 14:54:44 +0200 -Subject: [PATCH 11/87] xen/x86: p2m: Add preemption in p2m_teardown() - -The list p2m->pages contain all the pages used by the P2M. On large -instance this can be quite large and the time spent to call -d->arch.paging.free_page() will take more than 1ms for a 80GB guest -on a Xen running in nested environment on a c5.metal. - -By extrapolation, it would take > 100ms for a 8TB guest (what we -current security support). So add some preemption in p2m_teardown() -and propagate to the callers. 
Note there are 3 places where -the preemption is not enabled: - - hap_final_teardown()/shadow_final_teardown(): We are - preventing update the P2M once the domain is dying (so - no more pages could be allocated) and most of the P2M pages - will be freed in preemptive manneer when relinquishing the - resources. So this is fine to disable preemption. - - shadow_enable(): This is fine because it will undo the allocation - that may have been made by p2m_alloc_table() (so only the root - page table). - -The preemption is arbitrarily checked every 1024 iterations. - -We now need to include <xen/event.h> in p2m-basic in order to -import the definition for local_events_need_delivery() used by -general_preempt_check(). Ideally, the inclusion should happen in -xen/sched.h but it opened a can of worms. - -Note that with the current approach, Xen doesn't keep track on whether -the alt/nested P2Ms have been cleared. So there are some redundant work. -However, this is not expected to incurr too much overhead (the P2M lock -shouldn't be contended during teardown). So this is optimization is -left outside of the security event. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -master commit: 8a2111250b424edc49c65c4d41b276766d30635c -master date: 2022-10-11 14:24:48 +0200 ---- - xen/arch/x86/mm/hap/hap.c | 22 ++++++++++++++++------ - xen/arch/x86/mm/p2m.c | 18 +++++++++++++++--- - xen/arch/x86/mm/shadow/common.c | 12 +++++++++--- - xen/include/asm-x86/p2m.h | 2 +- - 4 files changed, 41 insertions(+), 13 deletions(-) - -diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c -index a44fcfd95e1e..1f9a157a0c34 100644 ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d) - - if ( hvm_altp2m_supported() ) - for ( i = 0; i < MAX_ALTP2M; i++ ) -- p2m_teardown(d->arch.altp2m_p2m[i], true); -+ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL); - - /* Destroy nestedp2m's first */ - for (i = 0; i < MAX_NESTEDP2M; i++) { -- p2m_teardown(d->arch.nested_p2m[i], true); -+ p2m_teardown(d->arch.nested_p2m[i], true, NULL); - } - - if ( d->arch.paging.hap.total_pages != 0 ) - hap_teardown(d, NULL); - -- p2m_teardown(p2m_get_hostp2m(d), true); -+ p2m_teardown(p2m_get_hostp2m(d), true, NULL); - /* Free any memory that the p2m teardown released */ - paging_lock(d); - hap_set_allocation(d, 0, NULL); -@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool *preempted) - FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); - - for ( i = 0; i < MAX_ALTP2M; i++ ) -- p2m_teardown(d->arch.altp2m_p2m[i], false); -+ { -+ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted); -+ if ( preempted && *preempted ) -+ return; -+ } - } - - /* Destroy nestedp2m's after altp2m. 
*/ - for ( i = 0; i < MAX_NESTEDP2M; i++ ) -- p2m_teardown(d->arch.nested_p2m[i], false); -+ { -+ p2m_teardown(d->arch.nested_p2m[i], false, preempted); -+ if ( preempted && *preempted ) -+ return; -+ } - -- p2m_teardown(p2m_get_hostp2m(d), false); -+ p2m_teardown(p2m_get_hostp2m(d), false, preempted); -+ if ( preempted && *preempted ) -+ return; - - paging_lock(d); /* Keep various asserts happy */ - -diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c -index aba4f17cbe12..8781df9dda8d 100644 ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -749,12 +749,13 @@ int p2m_alloc_table(struct p2m_domain *p2m) - * hvm fixme: when adding support for pvh non-hardware domains, this path must - * cleanup any foreign p2m types (release refcnts on them). - */ --void p2m_teardown(struct p2m_domain *p2m, bool remove_root) -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted) - /* Return all the p2m pages to Xen. - * We know we don't have any extra mappings to these pages */ - { - struct page_info *pg, *root_pg = NULL; - struct domain *d; -+ unsigned int i = 0; - - if (p2m == NULL) - return; -@@ -773,8 +774,19 @@ void p2m_teardown(struct p2m_domain *p2m, bool remove_root) - } - - while ( (pg = page_list_remove_head(&p2m->pages)) ) -- if ( pg != root_pg ) -- d->arch.paging.free_page(d, pg); -+ { -+ if ( pg == root_pg ) -+ continue; -+ -+ d->arch.paging.free_page(d, pg); -+ -+ /* Arbitrarily check preemption every 1024 iterations */ -+ if ( preempted && !(++i % 1024) && general_preempt_check() ) -+ { -+ *preempted = true; -+ break; -+ } -+ } - - if ( root_pg ) - page_list_add(root_pg, &p2m->pages); -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index ac9a1ae07808..3b0d781991b5 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2770,8 +2770,12 @@ int shadow_enable(struct domain *d, u32 mode) - out_locked: - paging_unlock(d); - out_unlocked: -+ /* -+ * This is fine to 
ignore the preemption here because only the root -+ * will be allocated by p2m_alloc_table(). -+ */ - if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) -- p2m_teardown(p2m, true); -+ p2m_teardown(p2m, true, NULL); - if ( rv != 0 && pg != NULL ) - { - pg->count_info &= ~PGC_count_mask; -@@ -2824,7 +2828,9 @@ void shadow_teardown(struct domain *d, bool *preempted) - for_each_vcpu ( d, v ) - shadow_vcpu_teardown(v); - -- p2m_teardown(p2m_get_hostp2m(d), false); -+ p2m_teardown(p2m_get_hostp2m(d), false, preempted); -+ if ( preempted && *preempted ) -+ return; - - paging_lock(d); - -@@ -2945,7 +2951,7 @@ void shadow_final_teardown(struct domain *d) - shadow_teardown(d, NULL); - - /* It is now safe to pull down the p2m map. */ -- p2m_teardown(p2m_get_hostp2m(d), true); -+ p2m_teardown(p2m_get_hostp2m(d), true, NULL); - /* Free any shadow memory that the p2m teardown released */ - paging_lock(d); - shadow_set_allocation(d, 0, NULL); -diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h -index c3c16748e7d5..2db9ab0122f2 100644 ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -574,7 +574,7 @@ int p2m_init(struct domain *d); - int p2m_alloc_table(struct p2m_domain *p2m); - - /* Return all the p2m resources to Xen. 
*/ --void p2m_teardown(struct p2m_domain *p2m, bool remove_root); -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted); - void p2m_final_teardown(struct domain *d); - - /* Add a page to a domain's p2m table */ --- -2.37.4 - diff --git a/0012-libxl-docs-Use-arch-specific-default-paging-memory.patch b/0012-libxl-docs-Use-arch-specific-default-paging-memory.patch deleted file mode 100644 index d1aeae9..0000000 --- a/0012-libxl-docs-Use-arch-specific-default-paging-memory.patch +++ /dev/null @@ -1,149 +0,0 @@ -From 755a9b52844de3e1e47aa1fc9991a4240ccfbf35 Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 11 Oct 2022 14:55:08 +0200 -Subject: [PATCH 12/87] libxl, docs: Use arch-specific default paging memory - -The default paging memory (descibed in `shadow_memory` entry in xl -config) in libxl is used to determine the memory pool size for xl -guests. Currently this size is only used for x86, and contains a part -of RAM to shadow the resident processes. Since on Arm there is no -shadow mode guests, so the part of RAM to shadow the resident processes -is not necessary. Therefore, this commit splits the function -`libxl_get_required_shadow_memory()` to arch specific helpers and -renamed the helper to `libxl__arch_get_required_paging_memory()`. - -On x86, this helper calls the original value from -`libxl_get_required_shadow_memory()` so no functional change intended. - -On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM -for the P2M map and additional 512KB. - -Also update the xl.cfg documentation to add Arm documentation -according to code changes and correct the comment style following Xen -coding style. - -This is part of CVE-2022-33747 / XSA-409. 
- -Suggested-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: 156a239ea288972425f967ac807b3cb5b5e14874 -master date: 2022-10-11 14:28:37 +0200 ---- - docs/man/xl.cfg.5.pod.in | 5 +++++ - tools/libs/light/libxl_arch.h | 4 ++++ - tools/libs/light/libxl_arm.c | 14 ++++++++++++++ - tools/libs/light/libxl_utils.c | 9 ++------- - tools/libs/light/libxl_x86.c | 13 +++++++++++++ - 5 files changed, 38 insertions(+), 7 deletions(-) - -diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in -index b98d1613987e..eda1e77ebd06 100644 ---- a/docs/man/xl.cfg.5.pod.in -+++ b/docs/man/xl.cfg.5.pod.in -@@ -1768,6 +1768,11 @@ are not using hardware assisted paging (i.e. you are using shadow - mode) and your guest workload consists of a very large number of - similar processes then increasing this value may improve performance. - -+On Arm, this field is used to determine the size of the guest P2M pages -+pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for -+the P2M map and additional 512KB for extended regions. Users should -+adjust this value if bigger P2M pool size is needed. 
-+ - =back - - =head3 Processor and Platform Features -diff --git a/tools/libs/light/libxl_arch.h b/tools/libs/light/libxl_arch.h -index 1522ecb97f72..5a060c2c3033 100644 ---- a/tools/libs/light/libxl_arch.h -+++ b/tools/libs/light/libxl_arch.h -@@ -90,6 +90,10 @@ void libxl__arch_update_domain_config(libxl__gc *gc, - libxl_domain_config *dst, - const libxl_domain_config *src); - -+_hidden -+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, -+ unsigned int smp_cpus); -+ - #if defined(__i386__) || defined(__x86_64__) - - #define LAPIC_BASE_ADDRESS 0xfee00000 -diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c -index eef1de093914..73a95e83af24 100644 ---- a/tools/libs/light/libxl_arm.c -+++ b/tools/libs/light/libxl_arm.c -@@ -154,6 +154,20 @@ out: - return rc; - } - -+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, -+ unsigned int smp_cpus) -+{ -+ /* -+ * 256 pages (1MB) per vcpu, -+ * plus 1 page per MiB of RAM for the P2M map, -+ * plus 1 page per MiB of extended region. This default value is 128 MiB -+ * which should be enough for domains that are not running backend. -+ * This is higher than the minimum that Xen would allocate if no value -+ * were given (but the Xen minimum is for safety, not performance). 
-+ */ -+ return 4 * (256 * smp_cpus + maxmem_kb / 1024 + 128); -+} -+ - static struct arch_info { - const char *guest_type; - const char *timer_compat; -diff --git a/tools/libs/light/libxl_utils.c b/tools/libs/light/libxl_utils.c -index 4699c4a0a36f..e276c0ee9cc3 100644 ---- a/tools/libs/light/libxl_utils.c -+++ b/tools/libs/light/libxl_utils.c -@@ -18,6 +18,7 @@ - #include <ctype.h> - - #include "libxl_internal.h" -+#include "libxl_arch.h" - #include "_paths.h" - - #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE -@@ -39,13 +40,7 @@ char *libxl_basename(const char *name) - - unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus) - { -- /* 256 pages (1MB) per vcpu, -- plus 1 page per MiB of RAM for the P2M map, -- plus 1 page per MiB of RAM to shadow the resident processes. -- This is higher than the minimum that Xen would allocate if no value -- were given (but the Xen minimum is for safety, not performance). -- */ -- return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024)); -+ return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus); - } - - char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid) -diff --git a/tools/libs/light/libxl_x86.c b/tools/libs/light/libxl_x86.c -index 1feadebb1852..51362893cf98 100644 ---- a/tools/libs/light/libxl_x86.c -+++ b/tools/libs/light/libxl_x86.c -@@ -882,6 +882,19 @@ void libxl__arch_update_domain_config(libxl__gc *gc, - libxl_defbool_val(src->b_info.arch_x86.msr_relaxed)); - } - -+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, -+ unsigned int smp_cpus) -+{ -+ /* -+ * 256 pages (1MB) per vcpu, -+ * plus 1 page per MiB of RAM for the P2M map, -+ * plus 1 page per MiB of RAM to shadow the resident processes. -+ * This is higher than the minimum that Xen would allocate if no value -+ * were given (but the Xen minimum is for safety, not performance). 
-+ */ -+ return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024)); -+} -+ - /* - * Local variables: - * mode: C --- -2.37.4 - diff --git a/0012-tools-ocaml-xenctrl-Use-larger-chunksize-in-domain_g.patch b/0012-tools-ocaml-xenctrl-Use-larger-chunksize-in-domain_g.patch new file mode 100644 index 0000000..fc352ad --- /dev/null +++ b/0012-tools-ocaml-xenctrl-Use-larger-chunksize-in-domain_g.patch @@ -0,0 +1,41 @@ +From 8c66a2d88a9f17e5b5099fcb83231b7a1169ca25 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Tue, 1 Nov 2022 17:59:17 +0000 +Subject: [PATCH 12/61] tools/ocaml/xenctrl: Use larger chunksize in + domain_getinfolist +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +domain_getinfolist() is quadratic with the number of domains, because of the +behaviour of the underlying hypercall. Nevertheless, getting domain info in +blocks of 1024 is far more efficient than blocks of 2. + +In a scalability testing scenario with ~1000 VMs, a combination of this and +the previous change takes xenopsd's wallclock time in domain_getinfolist() +down from 88% to 0.02% + +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Tested-by: Pau Ruiz Safont <pau.safont@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 95db09b1b154fb72fad861815ceae1f3fa49fc4e) +--- + tools/ocaml/libs/xc/xenctrl.ml | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/ocaml/libs/xc/xenctrl.ml b/tools/ocaml/libs/xc/xenctrl.ml +index f10b686215..b40c70d33f 100644 +--- a/tools/ocaml/libs/xc/xenctrl.ml ++++ b/tools/ocaml/libs/xc/xenctrl.ml +@@ -223,7 +223,7 @@ let rev_append_fold acc e = List.rev_append e acc + let rev_concat lst = List.fold_left rev_append_fold [] lst + + let domain_getinfolist handle first_domain = +- let nb = 2 in ++ let nb = 1024 in + let rec __getlist lst from = + (* _domain_getinfolist returns domains in reverse order, largest 
first *) + match _domain_getinfolist handle from nb with +-- +2.40.0 + diff --git a/0013-tools-ocaml-xb-mmap-Use-Data_abstract_val-wrapper.patch b/0013-tools-ocaml-xb-mmap-Use-Data_abstract_val-wrapper.patch new file mode 100644 index 0000000..a999dd8 --- /dev/null +++ b/0013-tools-ocaml-xb-mmap-Use-Data_abstract_val-wrapper.patch @@ -0,0 +1,75 @@ +From 049d16c8ce900dfc8f4b657849aeb82b95ed857c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Fri, 16 Dec 2022 18:25:10 +0000 +Subject: [PATCH 13/61] tools/ocaml/xb,mmap: Use Data_abstract_val wrapper +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This is not strictly necessary since it is essentially a no-op currently: a +cast to void * and value *, even in OCaml 5.0. + +However it does make it clearer that what we have here is not a regular OCaml +value, but one allocated with Abstract_tag or Custom_tag, and follows the +example from the manual more closely: +https://v2.ocaml.org/manual/intfc.html#ss:c-outside-head + +It also makes it clearer that these modules have been reviewed for +compat with OCaml 5.0. + +We cannot use OCaml finalizers here, because we want exact control over when +to unmap these pages from remote domains. + +No functional change. 
+ +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit d2ccc637111d6dbcf808aaffeec7a46f0b1e1c81) +--- + tools/ocaml/libs/mmap/mmap_stubs.h | 4 ++++ + tools/ocaml/libs/mmap/xenmmap_stubs.c | 2 +- + tools/ocaml/libs/xb/xs_ring_stubs.c | 2 +- + 3 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/tools/ocaml/libs/mmap/mmap_stubs.h b/tools/ocaml/libs/mmap/mmap_stubs.h +index 65e4239890..f4784e4715 100644 +--- a/tools/ocaml/libs/mmap/mmap_stubs.h ++++ b/tools/ocaml/libs/mmap/mmap_stubs.h +@@ -30,4 +30,8 @@ struct mmap_interface + int len; + }; + ++#ifndef Data_abstract_val ++#define Data_abstract_val(x) ((void *)Op_val(x)) ++#endif ++ + #endif +diff --git a/tools/ocaml/libs/mmap/xenmmap_stubs.c b/tools/ocaml/libs/mmap/xenmmap_stubs.c +index e2ce088e25..e03951d781 100644 +--- a/tools/ocaml/libs/mmap/xenmmap_stubs.c ++++ b/tools/ocaml/libs/mmap/xenmmap_stubs.c +@@ -28,7 +28,7 @@ + #include <caml/fail.h> + #include <caml/callback.h> + +-#define Intf_val(a) ((struct mmap_interface *) a) ++#define Intf_val(a) ((struct mmap_interface *)Data_abstract_val(a)) + + static int mmap_interface_init(struct mmap_interface *intf, + int fd, int pflag, int mflag, +diff --git a/tools/ocaml/libs/xb/xs_ring_stubs.c b/tools/ocaml/libs/xb/xs_ring_stubs.c +index 7a91fdee75..1f58524535 100644 +--- a/tools/ocaml/libs/xb/xs_ring_stubs.c ++++ b/tools/ocaml/libs/xb/xs_ring_stubs.c +@@ -35,7 +35,7 @@ + #include <sys/mman.h> + #include "mmap_stubs.h" + +-#define GET_C_STRUCT(a) ((struct mmap_interface *) a) ++#define GET_C_STRUCT(a) ((struct mmap_interface *)Data_abstract_val(a)) + + /* + * Bytes_val has been introduced by Ocaml 4.06.1. 
So define our own version +-- +2.40.0 + diff --git a/0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch b/0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch deleted file mode 100644 index 7ab3212..0000000 --- a/0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch +++ /dev/null @@ -1,189 +0,0 @@ -From 914fc8e8b4cc003e90d51bee0aef54687358530a Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 11 Oct 2022 14:55:21 +0200 -Subject: [PATCH 13/87] xen/arm: Construct the P2M pages pool for guests - -This commit constructs the p2m pages pool for guests from the -data structure and helper perspective. - -This is implemented by: - -- Adding a `struct paging_domain` which contains a freelist, a -counter variable and a spinlock to `struct arch_domain` to -indicate the free p2m pages and the number of p2m total pages in -the p2m pages pool. - -- Adding a helper `p2m_get_allocation` to get the p2m pool size. - -- Adding a helper `p2m_set_allocation` to set the p2m pages pool -size. This helper should be called before allocating memory for -a guest. - -- Adding a helper `p2m_teardown_allocation` to free the p2m pages -pool. This helper should be called during the xl domain destory. - -This is part of CVE-2022-33747 / XSA-409. 
- -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: 55914f7fc91a468649b8a3ec3f53ae1c4aca6670 -master date: 2022-10-11 14:28:39 +0200 ---- - xen/arch/arm/p2m.c | 88 ++++++++++++++++++++++++++++++++++++ - xen/include/asm-arm/domain.h | 10 ++++ - xen/include/asm-arm/p2m.h | 4 ++ - 3 files changed, 102 insertions(+) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 27418ee5ee98..d8957dd8727c 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -50,6 +50,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn) - return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48)); - } - -+/* Return the size of the pool, rounded up to the nearest MB */ -+unsigned int p2m_get_allocation(struct domain *d) -+{ -+ unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages); -+ -+ return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT); -+} -+ -+/* -+ * Set the pool of pages to the required number of pages. -+ * Returns 0 for success, non-zero for failure. -+ * Call with d->arch.paging.lock held. 
-+ */ -+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted) -+{ -+ struct page_info *pg; -+ -+ ASSERT(spin_is_locked(&d->arch.paging.lock)); -+ -+ for ( ; ; ) -+ { -+ if ( d->arch.paging.p2m_total_pages < pages ) -+ { -+ /* Need to allocate more memory from domheap */ -+ pg = alloc_domheap_page(NULL, 0); -+ if ( pg == NULL ) -+ { -+ printk(XENLOG_ERR "Failed to allocate P2M pages.\n"); -+ return -ENOMEM; -+ } -+ ACCESS_ONCE(d->arch.paging.p2m_total_pages) = -+ d->arch.paging.p2m_total_pages + 1; -+ page_list_add_tail(pg, &d->arch.paging.p2m_freelist); -+ } -+ else if ( d->arch.paging.p2m_total_pages > pages ) -+ { -+ /* Need to return memory to domheap */ -+ pg = page_list_remove_head(&d->arch.paging.p2m_freelist); -+ if( pg ) -+ { -+ ACCESS_ONCE(d->arch.paging.p2m_total_pages) = -+ d->arch.paging.p2m_total_pages - 1; -+ free_domheap_page(pg); -+ } -+ else -+ { -+ printk(XENLOG_ERR -+ "Failed to free P2M pages, P2M freelist is empty.\n"); -+ return -ENOMEM; -+ } -+ } -+ else -+ break; -+ -+ /* Check to see if we need to yield and try again */ -+ if ( preempted && general_preempt_check() ) -+ { -+ *preempted = true; -+ return -ERESTART; -+ } -+ } -+ -+ return 0; -+} -+ -+int p2m_teardown_allocation(struct domain *d) -+{ -+ int ret = 0; -+ bool preempted = false; -+ -+ spin_lock(&d->arch.paging.lock); -+ if ( d->arch.paging.p2m_total_pages != 0 ) -+ { -+ ret = p2m_set_allocation(d, 0, &preempted); -+ if ( preempted ) -+ { -+ spin_unlock(&d->arch.paging.lock); -+ return -ERESTART; -+ } -+ ASSERT(d->arch.paging.p2m_total_pages == 0); -+ } -+ spin_unlock(&d->arch.paging.lock); -+ -+ return ret; -+} -+ - /* Unlock the flush and do a P2M TLB flush if necessary */ - void p2m_write_unlock(struct p2m_domain *p2m) - { -@@ -1599,7 +1685,9 @@ int p2m_init(struct domain *d) - unsigned int cpu; - - rwlock_init(&p2m->lock); -+ spin_lock_init(&d->arch.paging.lock); - INIT_PAGE_LIST_HEAD(&p2m->pages); -+ INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist); - 
- p2m->vmid = INVALID_VMID; - -diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h -index 7f8ddd3f5c3b..2f31795ab96d 100644 ---- a/xen/include/asm-arm/domain.h -+++ b/xen/include/asm-arm/domain.h -@@ -40,6 +40,14 @@ struct vtimer { - uint64_t cval; - }; - -+struct paging_domain { -+ spinlock_t lock; -+ /* Free P2M pages from the pre-allocated P2M pool */ -+ struct page_list_head p2m_freelist; -+ /* Number of pages from the pre-allocated P2M pool */ -+ unsigned long p2m_total_pages; -+}; -+ - struct arch_domain - { - #ifdef CONFIG_ARM_64 -@@ -51,6 +59,8 @@ struct arch_domain - - struct hvm_domain hvm; - -+ struct paging_domain paging; -+ - struct vmmio vmmio; - - /* Continuable domain_relinquish_resources(). */ -diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h -index b3ba83283e11..c9598740bd02 100644 ---- a/xen/include/asm-arm/p2m.h -+++ b/xen/include/asm-arm/p2m.h -@@ -218,6 +218,10 @@ void p2m_restore_state(struct vcpu *n); - /* Print debugging/statistial info about a domain's p2m */ - void p2m_dump_info(struct domain *d); - -+unsigned int p2m_get_allocation(struct domain *d); -+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted); -+int p2m_teardown_allocation(struct domain *d); -+ - static inline void p2m_write_lock(struct p2m_domain *p2m) - { - write_lock(&p2m->lock); --- -2.37.4 - diff --git a/0014-tools-ocaml-xb-Drop-Xs_ring.write.patch b/0014-tools-ocaml-xb-Drop-Xs_ring.write.patch new file mode 100644 index 0000000..813f041 --- /dev/null +++ b/0014-tools-ocaml-xb-Drop-Xs_ring.write.patch @@ -0,0 +1,62 @@ +From f7c4fab9b50af74d0e1170fbf35367ced48d8209 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Fri, 16 Dec 2022 18:25:20 +0000 +Subject: [PATCH 14/61] tools/ocaml/xb: Drop Xs_ring.write +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This function is unusued (only Xs_ring.write_substring is 
used), and the +bytes/string conversion here is backwards: the C stub implements the bytes +version and then we use a Bytes.unsafe_of_string to convert a string into +bytes. + +However the operation here really is read-only: we read from the string and +write it to the ring, so the C stub should implement the read-only string +version, and if needed we could use Bytes.unsafe_to_string to be able to send +'bytes'. However that is not necessary as the 'bytes' version is dropped above. + +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 01f139215e678c2dc7d4bb3f9f2777069bb1b091) +--- + tools/ocaml/libs/xb/xs_ring.ml | 5 +---- + tools/ocaml/libs/xb/xs_ring_stubs.c | 2 +- + 2 files changed, 2 insertions(+), 5 deletions(-) + +diff --git a/tools/ocaml/libs/xb/xs_ring.ml b/tools/ocaml/libs/xb/xs_ring.ml +index db7f86bd27..dd5e014a33 100644 +--- a/tools/ocaml/libs/xb/xs_ring.ml ++++ b/tools/ocaml/libs/xb/xs_ring.ml +@@ -25,14 +25,11 @@ module Server_features = Set.Make(struct + end) + + external read: Xenmmap.mmap_interface -> bytes -> int -> int = "ml_interface_read" +-external write: Xenmmap.mmap_interface -> bytes -> int -> int = "ml_interface_write" ++external write_substring: Xenmmap.mmap_interface -> string -> int -> int = "ml_interface_write" + + external _internal_set_server_features: Xenmmap.mmap_interface -> int -> unit = "ml_interface_set_server_features" [@@noalloc] + external _internal_get_server_features: Xenmmap.mmap_interface -> int = "ml_interface_get_server_features" [@@noalloc] + +-let write_substring mmap buff len = +- write mmap (Bytes.unsafe_of_string buff) len +- + let get_server_features mmap = + (* NB only one feature currently defined above *) + let x = _internal_get_server_features mmap in +diff --git a/tools/ocaml/libs/xb/xs_ring_stubs.c b/tools/ocaml/libs/xb/xs_ring_stubs.c +index 1f58524535..1243c63f03 100644 +--- a/tools/ocaml/libs/xb/xs_ring_stubs.c ++++ 
b/tools/ocaml/libs/xb/xs_ring_stubs.c +@@ -112,7 +112,7 @@ CAMLprim value ml_interface_write(value ml_interface, + CAMLlocal1(ml_result); + + struct mmap_interface *interface = GET_C_STRUCT(ml_interface); +- const unsigned char *buffer = Bytes_val(ml_buffer); ++ const char *buffer = String_val(ml_buffer); + int len = Int_val(ml_len); + int result; + +-- +2.40.0 + diff --git a/0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch b/0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch deleted file mode 100644 index 0c19560..0000000 --- a/0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch +++ /dev/null @@ -1,108 +0,0 @@ -From 3a16da801e14b8ff996b6f7408391ce488abd925 Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 11 Oct 2022 14:55:40 +0200 -Subject: [PATCH 14/87] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm - -This commit implements the `XEN_DOMCTL_shadow_op` support in Xen -for Arm. The p2m pages pool size for xl guests is supposed to be -determined by `XEN_DOMCTL_shadow_op`. Hence, this commit: - -- Introduces a function `p2m_domctl` and implements the subops -`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and -`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`. - -- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl. - -Therefore enabling the setting of shadow memory pool size -when creating a guest from xl and getting shadow memory pool size -from Xen. - -Note that the `XEN_DOMCTL_shadow_op` added in this commit is only -a dummy op, and the functionality of setting/getting p2m memory pool -size for xl guests will be added in following commits. - -This is part of CVE-2022-33747 / XSA-409. 
- -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: cf2a68d2ffbc3ce95e01449d46180bddb10d24a0 -master date: 2022-10-11 14:28:42 +0200 ---- - tools/libs/light/libxl_arm.c | 12 ++++++++++++ - xen/arch/arm/domctl.c | 32 ++++++++++++++++++++++++++++++++ - 2 files changed, 44 insertions(+) - -diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c -index 73a95e83af24..22a0c561bbc6 100644 ---- a/tools/libs/light/libxl_arm.c -+++ b/tools/libs/light/libxl_arm.c -@@ -131,6 +131,18 @@ int libxl__arch_domain_create(libxl__gc *gc, - libxl__domain_build_state *state, - uint32_t domid) - { -+ libxl_ctx *ctx = libxl__gc_owner(gc); -+ unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024); -+ -+ int r = xc_shadow_control(ctx->xch, domid, -+ XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, -+ &shadow_mb, 0); -+ if (r) { -+ LOGED(ERROR, domid, -+ "Failed to set %u MiB shadow allocation", shadow_mb); -+ return ERROR_FAIL; -+ } -+ - return 0; - } - -diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c -index 1baf25c3d98b..9bf72e693019 100644 ---- a/xen/arch/arm/domctl.c -+++ b/xen/arch/arm/domctl.c -@@ -47,11 +47,43 @@ static int handle_vuart_init(struct domain *d, - return rc; - } - -+static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, -+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) -+{ -+ if ( unlikely(d == current->domain) ) -+ { -+ printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n"); -+ return -EINVAL; -+ } -+ -+ if ( unlikely(d->is_dying) ) -+ { -+ printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n", -+ d->domain_id); -+ return -EINVAL; -+ } -+ -+ switch ( sc->op ) -+ { -+ case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: -+ return 0; -+ case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: -+ return 0; -+ default: -+ { -+ printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op); -+ return -EINVAL; -+ } -+ } -+} -+ - long arch_do_domctl(struct xen_domctl 
*domctl, struct domain *d, - XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - { - switch ( domctl->cmd ) - { -+ case XEN_DOMCTL_shadow_op: -+ return p2m_domctl(d, &domctl->u.shadow_op, u_domctl); - case XEN_DOMCTL_cacheflush: - { - gfn_t s = _gfn(domctl->u.cacheflush.start_pfn); --- -2.37.4 - diff --git a/0015-tools-oxenstored-validate-config-file-before-live-up.patch b/0015-tools-oxenstored-validate-config-file-before-live-up.patch new file mode 100644 index 0000000..f65fbd6 --- /dev/null +++ b/0015-tools-oxenstored-validate-config-file-before-live-up.patch @@ -0,0 +1,131 @@ +From fd1c70442d3aa962be4d041d5f8fce9d2fa72ce1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> +Date: Tue, 11 May 2021 15:56:50 +0000 +Subject: [PATCH 15/61] tools/oxenstored: validate config file before live + update +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The configuration file can contain typos or various errors that could prevent +live update from succeeding (e.g. a flag only valid on a different version). +Unknown entries in the config file would be ignored on startup normally, +add a strict --config-test that live-update can use to check that the config file +is valid *for the new binary*. + +For compatibility with running old code during live update recognize +--live --help as an equivalent to --config-test. 
+ +Signed-off-by: Edwin Török <edvin.torok@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit e6f07052ce4a0f0b7d4dc522d87465efb2d9ee86) +--- + tools/ocaml/xenstored/parse_arg.ml | 26 ++++++++++++++++++++++++++ + tools/ocaml/xenstored/xenstored.ml | 11 +++++++++-- + 2 files changed, 35 insertions(+), 2 deletions(-) + +diff --git a/tools/ocaml/xenstored/parse_arg.ml b/tools/ocaml/xenstored/parse_arg.ml +index 7c0478e76a..5e4ca6f1f7 100644 +--- a/tools/ocaml/xenstored/parse_arg.ml ++++ b/tools/ocaml/xenstored/parse_arg.ml +@@ -26,8 +26,14 @@ type config = + restart: bool; + live_reload: bool; + disable_socket: bool; ++ config_test: bool; + } + ++let get_config_filename config_file = ++ match config_file with ++ | Some name -> name ++ | None -> Define.default_config_dir ^ "/oxenstored.conf" ++ + let do_argv = + let pidfile = ref "" and tracefile = ref "" (* old xenstored compatibility *) + and domain_init = ref true +@@ -38,6 +44,8 @@ let do_argv = + and restart = ref false + and live_reload = ref false + and disable_socket = ref false ++ and config_test = ref false ++ and help = ref false + in + + let speclist = +@@ -55,10 +63,27 @@ let do_argv = + ("-T", Arg.Set_string tracefile, ""); (* for compatibility *) + ("--restart", Arg.Set restart, "Read database on starting"); + ("--live", Arg.Set live_reload, "Read live dump on startup"); ++ ("--config-test", Arg.Set config_test, "Test validity of config file"); + ("--disable-socket", Arg.Unit (fun () -> disable_socket := true), "Disable socket"); ++ ("--help", Arg.Set help, "Display this list of options") + ] in + let usage_msg = "usage : xenstored [--config-file <filename>] [--no-domain-init] [--help] [--no-fork] [--reraise-top-level] [--restart] [--disable-socket]" in + Arg.parse speclist (fun _ -> ()) usage_msg; ++ let () = ++ if !help then begin ++ if !live_reload then ++ (* ++ * Transform --live --help into --config-test for backward compat with ++ * running code during 
live update. ++ * Caller will validate config and exit ++ *) ++ config_test := true ++ else begin ++ Arg.usage_string speclist usage_msg |> print_endline; ++ exit 0 ++ end ++ end ++ in + { + domain_init = !domain_init; + activate_access_log = !activate_access_log; +@@ -70,4 +95,5 @@ let do_argv = + restart = !restart; + live_reload = !live_reload; + disable_socket = !disable_socket; ++ config_test = !config_test; + } +diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml +index 4d5851c5cb..e2638a5af2 100644 +--- a/tools/ocaml/xenstored/xenstored.ml ++++ b/tools/ocaml/xenstored/xenstored.ml +@@ -88,7 +88,7 @@ let default_pidfile = Paths.xen_run_dir ^ "/xenstored.pid" + + let ring_scan_interval = ref 20 + +-let parse_config filename = ++let parse_config ?(strict=false) filename = + let pidfile = ref default_pidfile in + let options = [ + ("merge-activate", Config.Set_bool Transaction.do_coalesce); +@@ -129,11 +129,12 @@ let parse_config filename = + ("xenstored-port", Config.Set_string Domains.xenstored_port); ] in + begin try Config.read filename options (fun _ _ -> raise Not_found) + with +- | Config.Error err -> List.iter (fun (k, e) -> ++ | Config.Error err as e -> List.iter (fun (k, e) -> + match e with + | "unknown key" -> eprintf "config: unknown key %s\n" k + | _ -> eprintf "config: %s: %s\n" k e + ) err; ++ if strict then raise e + | Sys_error m -> eprintf "error: config: %s\n" m; + end; + !pidfile +@@ -358,6 +359,12 @@ let tweak_gc () = + let () = + Printexc.set_uncaught_exception_handler Logging.fallback_exception_handler; + let cf = do_argv in ++ if cf.config_test then begin ++ let path = config_filename cf in ++ let _pidfile:string = parse_config ~strict:true path in ++ Printf.printf "Configuration valid at %s\n%!" 
path; ++ exit 0 ++ end; + let pidfile = + if Sys.file_exists (config_filename cf) then + parse_config (config_filename cf) +-- +2.40.0 + diff --git a/0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch b/0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch deleted file mode 100644 index 7472b4b..0000000 --- a/0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch +++ /dev/null @@ -1,289 +0,0 @@ -From 44e9dcc48b81bca202a5b31926125a6a59a4c72e Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 11 Oct 2022 14:55:53 +0200 -Subject: [PATCH 15/87] xen/arm: Allocate and free P2M pages from the P2M pool - -This commit sets/tearsdown of p2m pages pool for non-privileged Arm -guests by calling `p2m_set_allocation` and `p2m_teardown_allocation`. - -- For dom0, P2M pages should come from heap directly instead of p2m -pool, so that the kernel may take advantage of the extended regions. - -- For xl guests, the setting of the p2m pool is called in -`XEN_DOMCTL_shadow_op` and the p2m pool is destroyed in -`domain_relinquish_resources`. Note that domctl->u.shadow_op.mb is -updated with the new size when setting the p2m pool. - -- For dom0less domUs, the setting of the p2m pool is called before -allocating memory during domain creation. Users can specify the p2m -pool size by `xen,domain-p2m-mem-mb` dts property. - -To actually allocate/free pages from the p2m pool, this commit adds -two helper functions namely `p2m_alloc_page` and `p2m_free_page` to -`struct p2m_domain`. By replacing the `alloc_domheap_page` and -`free_domheap_page` with these two helper functions, p2m pages can -be added/removed from the list of p2m pool rather than from the heap. - -Since page from `p2m_alloc_page` is cleaned, take the opportunity -to remove the redundant `clean_page` in `p2m_create_table`. - -This is part of CVE-2022-33747 / XSA-409. 
- -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -master commit: cbea5a1149ca7fd4b7cdbfa3ec2e4f109b601ff7 -master date: 2022-10-11 14:28:44 +0200 ---- - docs/misc/arm/device-tree/booting.txt | 8 ++++ - xen/arch/arm/domain.c | 6 +++ - xen/arch/arm/domain_build.c | 29 ++++++++++++++ - xen/arch/arm/domctl.c | 23 ++++++++++- - xen/arch/arm/p2m.c | 57 +++++++++++++++++++++++++-- - 5 files changed, 118 insertions(+), 5 deletions(-) - -diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt -index 71895663a4de..d92ccc56ffe0 100644 ---- a/docs/misc/arm/device-tree/booting.txt -+++ b/docs/misc/arm/device-tree/booting.txt -@@ -182,6 +182,14 @@ with the following properties: - Both #address-cells and #size-cells need to be specified because - both sub-nodes (described shortly) have reg properties. - -+- xen,domain-p2m-mem-mb -+ -+ Optional. A 32-bit integer specifying the amount of megabytes of RAM -+ used for the domain P2M pool. This is in-sync with the shadow_memory -+ option in xl.cfg. Leaving this field empty in device tree will lead to -+ the default size of domain P2M pool, i.e. 1MB per guest vCPU plus 4KB -+ per MB of guest RAM plus 512KB for guest extended regions. -+ - Under the "xen,domain" compatible node, one or more sub-nodes are present - for the DomU kernel and ramdisk. 
- -diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c -index 2694c39127c5..a818f33a1afa 100644 ---- a/xen/arch/arm/domain.c -+++ b/xen/arch/arm/domain.c -@@ -997,6 +997,7 @@ enum { - PROG_page, - PROG_mapping, - PROG_p2m, -+ PROG_p2m_pool, - PROG_done, - }; - -@@ -1062,6 +1063,11 @@ int domain_relinquish_resources(struct domain *d) - if ( ret ) - return ret; - -+ PROGRESS(p2m_pool): -+ ret = p2m_teardown_allocation(d); -+ if( ret ) -+ return ret; -+ - PROGRESS(done): - break; - -diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c -index d02bacbcd1ed..8aec3755ca5d 100644 ---- a/xen/arch/arm/domain_build.c -+++ b/xen/arch/arm/domain_build.c -@@ -2833,6 +2833,21 @@ static void __init find_gnttab_region(struct domain *d, - kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size); - } - -+static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb, -+ unsigned int smp_cpus) -+{ -+ /* -+ * Keep in sync with libxl__get_required_paging_memory(). -+ * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map, -+ * plus 128 pages to cover extended regions. -+ */ -+ unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128); -+ -+ BUILD_BUG_ON(PAGE_SIZE != SZ_4K); -+ -+ return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT); -+} -+ - static int __init construct_domain(struct domain *d, struct kernel_info *kinfo) - { - unsigned int i; -@@ -2924,6 +2939,8 @@ static int __init construct_domU(struct domain *d, - struct kernel_info kinfo = {}; - int rc; - u64 mem; -+ u32 p2m_mem_mb; -+ unsigned long p2m_pages; - - rc = dt_property_read_u64(node, "memory", &mem); - if ( !rc ) -@@ -2933,6 +2950,18 @@ static int __init construct_domU(struct domain *d, - } - kinfo.unassigned_mem = (paddr_t)mem * SZ_1K; - -+ rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb); -+ /* If xen,domain-p2m-mem-mb is not specified, use the default value. */ -+ p2m_pages = rc ? 
-+ p2m_mem_mb << (20 - PAGE_SHIFT) : -+ domain_p2m_pages(mem, d->max_vcpus); -+ -+ spin_lock(&d->arch.paging.lock); -+ rc = p2m_set_allocation(d, p2m_pages, NULL); -+ spin_unlock(&d->arch.paging.lock); -+ if ( rc != 0 ) -+ return rc; -+ - printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem); - - kinfo.vpl011 = dt_property_read_bool(node, "vpl011"); -diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c -index 9bf72e693019..c8fdeb124084 100644 ---- a/xen/arch/arm/domctl.c -+++ b/xen/arch/arm/domctl.c -@@ -50,6 +50,9 @@ static int handle_vuart_init(struct domain *d, - static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, - XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - { -+ long rc; -+ bool preempted = false; -+ - if ( unlikely(d == current->domain) ) - { - printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n"); -@@ -66,9 +69,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, - switch ( sc->op ) - { - case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: -- return 0; -+ { -+ /* Allow and handle preemption */ -+ spin_lock(&d->arch.paging.lock); -+ rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted); -+ spin_unlock(&d->arch.paging.lock); -+ -+ if ( preempted ) -+ /* Not finished. Set up to re-run the call. */ -+ rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h", -+ u_domctl); -+ else -+ /* Finished. Return the new allocation. 
*/ -+ sc->mb = p2m_get_allocation(d); -+ -+ return rc; -+ } - case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: -+ { -+ sc->mb = p2m_get_allocation(d); - return 0; -+ } - default: - { - printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op); -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index d8957dd8727c..b2d856a801af 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -50,6 +50,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn) - return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48)); - } - -+static struct page_info *p2m_alloc_page(struct domain *d) -+{ -+ struct page_info *pg; -+ -+ spin_lock(&d->arch.paging.lock); -+ /* -+ * For hardware domain, there should be no limit in the number of pages that -+ * can be allocated, so that the kernel may take advantage of the extended -+ * regions. Hence, allocate p2m pages for hardware domains from heap. -+ */ -+ if ( is_hardware_domain(d) ) -+ { -+ pg = alloc_domheap_page(NULL, 0); -+ if ( pg == NULL ) -+ { -+ printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n"); -+ spin_unlock(&d->arch.paging.lock); -+ return NULL; -+ } -+ } -+ else -+ { -+ pg = page_list_remove_head(&d->arch.paging.p2m_freelist); -+ if ( unlikely(!pg) ) -+ { -+ spin_unlock(&d->arch.paging.lock); -+ return NULL; -+ } -+ d->arch.paging.p2m_total_pages--; -+ } -+ spin_unlock(&d->arch.paging.lock); -+ -+ return pg; -+} -+ -+static void p2m_free_page(struct domain *d, struct page_info *pg) -+{ -+ spin_lock(&d->arch.paging.lock); -+ if ( is_hardware_domain(d) ) -+ free_domheap_page(pg); -+ else -+ { -+ d->arch.paging.p2m_total_pages++; -+ page_list_add_tail(pg, &d->arch.paging.p2m_freelist); -+ } -+ spin_unlock(&d->arch.paging.lock); -+} -+ - /* Return the size of the pool, rounded up to the nearest MB */ - unsigned int p2m_get_allocation(struct domain *d) - { -@@ -751,7 +799,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry) - - ASSERT(!p2m_is_valid(*entry)); - -- page = alloc_domheap_page(NULL, 
0); -+ page = p2m_alloc_page(p2m->domain); - if ( page == NULL ) - return -ENOMEM; - -@@ -878,7 +926,7 @@ static void p2m_free_entry(struct p2m_domain *p2m, - pg = mfn_to_page(mfn); - - page_list_del(pg, &p2m->pages); -- free_domheap_page(pg); -+ p2m_free_page(p2m->domain, pg); - } - - static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, -@@ -902,7 +950,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, - ASSERT(level < target); - ASSERT(p2m_is_superpage(*entry, level)); - -- page = alloc_domheap_page(NULL, 0); -+ page = p2m_alloc_page(p2m->domain); - if ( !page ) - return false; - -@@ -1641,7 +1689,7 @@ int p2m_teardown(struct domain *d) - - while ( (pg = page_list_remove_head(&p2m->pages)) ) - { -- free_domheap_page(pg); -+ p2m_free_page(p2m->domain, pg); - count++; - /* Arbitrarily preempt every 512 iterations */ - if ( !(count % 512) && hypercall_preempt_check() ) -@@ -1665,6 +1713,7 @@ void p2m_final_teardown(struct domain *d) - return; - - ASSERT(page_list_empty(&p2m->pages)); -+ ASSERT(page_list_empty(&d->arch.paging.p2m_freelist)); - - if ( p2m->root ) - free_domheap_pages(p2m->root, P2M_ROOT_ORDER); --- -2.37.4 - diff --git a/0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch b/0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch deleted file mode 100644 index dfb46a9..0000000 --- a/0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch +++ /dev/null @@ -1,66 +0,0 @@ -From 32cb81501c8b858fe9a451650804ec3024a8b364 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 14:56:29 +0200 -Subject: [PATCH 16/87] gnttab: correct locking on transitive grant copy error - path - -While the comment next to the lock dropping in preparation of -recursively calling acquire_grant_for_copy() mistakenly talks about the -rd == td case (excluded a few lines further up), the same concerns apply -to the calling of release_grant_for_copy() on a subsequent error path. 
- -This is CVE-2022-33748 / XSA-411. - -Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -master commit: 6e3aab858eef614a21a782a3b73acc88e74690ea -master date: 2022-10-11 14:29:30 +0200 ---- - xen/common/grant_table.c | 19 ++++++++++++++++--- - 1 file changed, 16 insertions(+), 3 deletions(-) - -diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c -index 4c742cd8fe81..d8ca645b96ff 100644 ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -2613,9 +2613,8 @@ acquire_grant_for_copy( - trans_domid); - - /* -- * acquire_grant_for_copy() could take the lock on the -- * remote table (if rd == td), so we have to drop the lock -- * here and reacquire. -+ * acquire_grant_for_copy() will take the lock on the remote table, -+ * so we have to drop the lock here and reacquire. - */ - active_entry_release(act); - grant_read_unlock(rgt); -@@ -2652,11 +2651,25 @@ acquire_grant_for_copy( - act->trans_gref != trans_gref || - !act->is_sub_page)) ) - { -+ /* -+ * Like above for acquire_grant_for_copy() we need to drop and then -+ * re-acquire the locks here to prevent lock order inversion issues. -+ * Unlike for acquire_grant_for_copy() we don't need to re-check -+ * anything, as release_grant_for_copy() doesn't depend on the grant -+ * table entry: It only updates internal state and the status flags. 
-+ */ -+ active_entry_release(act); -+ grant_read_unlock(rgt); -+ - release_grant_for_copy(td, trans_gref, readonly); - rcu_unlock_domain(td); -+ -+ grant_read_lock(rgt); -+ act = active_entry_acquire(rgt, gref); - reduce_status_for_pin(rd, act, status, readonly); - active_entry_release(act); - grant_read_unlock(rgt); -+ - put_page(*page); - *page = NULL; - return ERESTART; --- -2.37.4 - diff --git a/0016-tools-ocaml-libs-Don-t-declare-stubs-as-taking-void.patch b/0016-tools-ocaml-libs-Don-t-declare-stubs-as-taking-void.patch new file mode 100644 index 0000000..a64d657 --- /dev/null +++ b/0016-tools-ocaml-libs-Don-t-declare-stubs-as-taking-void.patch @@ -0,0 +1,61 @@ +From 552e5f28d411c1a1a92f2fd3592a76e74f47610b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edwin.torok@cloud.com> +Date: Thu, 12 Jan 2023 11:28:29 +0000 +Subject: [PATCH 16/61] tools/ocaml/libs: Don't declare stubs as taking void +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +There is no such thing as an Ocaml function (C stub or otherwise) taking no +parameters. In the absence of any other parameters, unit is still passed. + +This doesn't explode with any ABI we care about, but would malfunction for an +ABI environment such as stdcall. 
+ +Fixes: c3afd398ba7f ("ocaml: Add XS bindings.") +Fixes: 8b7ce06a2d34 ("ocaml: Add XC bindings.") +Signed-off-by: Edwin Török <edwin.torok@cloud.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit ff8b560be80b9211c303d74df7e4b3921d2bb8ca) +--- + tools/ocaml/libs/xb/xenbus_stubs.c | 5 ++--- + tools/ocaml/libs/xc/xenctrl_stubs.c | 4 ++-- + 2 files changed, 4 insertions(+), 5 deletions(-) + +diff --git a/tools/ocaml/libs/xb/xenbus_stubs.c b/tools/ocaml/libs/xb/xenbus_stubs.c +index 3065181a55..97116b0782 100644 +--- a/tools/ocaml/libs/xb/xenbus_stubs.c ++++ b/tools/ocaml/libs/xb/xenbus_stubs.c +@@ -30,10 +30,9 @@ + #include <xenctrl.h> + #include <xen/io/xs_wire.h> + +-CAMLprim value stub_header_size(void) ++CAMLprim value stub_header_size(value unit) + { +- CAMLparam0(); +- CAMLreturn(Val_int(sizeof(struct xsd_sockmsg))); ++ return Val_int(sizeof(struct xsd_sockmsg)); + } + + CAMLprim value stub_header_of_string(value s) +diff --git a/tools/ocaml/libs/xc/xenctrl_stubs.c b/tools/ocaml/libs/xc/xenctrl_stubs.c +index 5b4fe72c8d..434fc0345b 100644 +--- a/tools/ocaml/libs/xc/xenctrl_stubs.c ++++ b/tools/ocaml/libs/xc/xenctrl_stubs.c +@@ -67,9 +67,9 @@ static void Noreturn failwith_xc(xc_interface *xch) + caml_raise_with_string(*caml_named_value("xc.error"), error_str); + } + +-CAMLprim value stub_xc_interface_open(void) ++CAMLprim value stub_xc_interface_open(value unit) + { +- CAMLparam0(); ++ CAMLparam1(unit); + xc_interface *xch; + + /* Don't assert XC_OPENFLAG_NON_REENTRANT because these bindings +-- +2.40.0 + diff --git a/0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch b/0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch deleted file mode 100644 index 8133c53..0000000 --- a/0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch +++ /dev/null @@ -1,112 +0,0 @@ -From e85e2a3c17b6cd38de041cdaf14d9efdcdabad1a Mon Sep 17 00:00:00 2001 
-From: Anthony PERARD <anthony.perard@citrix.com> -Date: Tue, 11 Oct 2022 14:59:10 +0200 -Subject: [PATCH 17/87] tools/libxl: Replace deprecated -soundhw on QEMU - command line - --soundhw is deprecated since 825ff02911c9 ("audio: add soundhw -deprecation notice"), QEMU v5.1, and is been remove for upcoming v7.1 -by 039a68373c45 ("introduce -audio as a replacement for -soundhw"). - -Instead we can just add the sound card with "-device", for most option -that "-soundhw" could handle. "-device" is an option that existed -before QEMU 1.0, and could already be used to add audio hardware. - -The list of possible option for libxl's "soundhw" is taken the list -from QEMU 7.0. - -The list of options for "soundhw" are listed in order of preference in -the manual. The first three (hda, ac97, es1370) are PCI devices and -easy to test on Linux, and the last four are ISA devices which doesn't -seems to work out of the box on linux. - -The sound card 'pcspk' isn't listed even if it used to be accepted by -'-soundhw' because QEMU crash when trying to add it to a Xen domain. -Also, it wouldn't work with "-device" might need to be "-machine -pcspk-audiodev=default" instead. - -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Jason Andryuk <jandryuk@gmail.com> -master commit: 62ca138c2c052187783aca3957d3f47c4dcfd683 -master date: 2022-08-18 09:25:50 +0200 ---- - docs/man/xl.cfg.5.pod.in | 6 +++--- - tools/libs/light/libxl_dm.c | 19 ++++++++++++++++++- - tools/libs/light/libxl_types_internal.idl | 10 ++++++++++ - 3 files changed, 31 insertions(+), 4 deletions(-) - -diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in -index eda1e77ebd06..ab7541f22c3e 100644 ---- a/docs/man/xl.cfg.5.pod.in -+++ b/docs/man/xl.cfg.5.pod.in -@@ -2545,9 +2545,9 @@ The form serial=DEVICE is also accepted for backwards compatibility. - - =item B<soundhw="DEVICE"> - --Select the virtual sound card to expose to the guest. 
The valid --devices are defined by the device model configuration, please see the --B<qemu(1)> manpage for details. The default is not to export any sound -+Select the virtual sound card to expose to the guest. The valid devices are -+B<hda>, B<ac97>, B<es1370>, B<adlib>, B<cs4231a>, B<gus>, B<sb16> if there are -+available with the device model QEMU. The default is not to export any sound - device. - - =item B<vkb_device=BOOLEAN> -diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c -index 04bf5d85632e..fc264a3a13a6 100644 ---- a/tools/libs/light/libxl_dm.c -+++ b/tools/libs/light/libxl_dm.c -@@ -1204,6 +1204,7 @@ static int libxl__build_device_model_args_new(libxl__gc *gc, - uint64_t ram_size; - const char *path, *chardev; - bool is_stubdom = libxl_defbool_val(b_info->device_model_stubdomain); -+ int rc; - - dm_args = flexarray_make(gc, 16, 1); - dm_envs = flexarray_make(gc, 16, 1); -@@ -1531,7 +1532,23 @@ static int libxl__build_device_model_args_new(libxl__gc *gc, - } - } - if (b_info->u.hvm.soundhw) { -- flexarray_vappend(dm_args, "-soundhw", b_info->u.hvm.soundhw, NULL); -+ libxl__qemu_soundhw soundhw; -+ -+ rc = libxl__qemu_soundhw_from_string(b_info->u.hvm.soundhw, &soundhw); -+ if (rc) { -+ LOGD(ERROR, guest_domid, "Unknown soundhw option '%s'", b_info->u.hvm.soundhw); -+ return ERROR_INVAL; -+ } -+ -+ switch (soundhw) { -+ case LIBXL__QEMU_SOUNDHW_HDA: -+ flexarray_vappend(dm_args, "-device", "intel-hda", -+ "-device", "hda-duplex", NULL); -+ break; -+ default: -+ flexarray_append_pair(dm_args, "-device", -+ (char*)libxl__qemu_soundhw_to_string(soundhw)); -+ } - } - if (!libxl__acpi_defbool_val(b_info)) { - flexarray_append(dm_args, "-no-acpi"); -diff --git a/tools/libs/light/libxl_types_internal.idl b/tools/libs/light/libxl_types_internal.idl -index 3593e21dbb64..caa08d3229cd 100644 ---- a/tools/libs/light/libxl_types_internal.idl -+++ b/tools/libs/light/libxl_types_internal.idl -@@ -55,3 +55,13 @@ libxl__device_action = 
Enumeration("device_action", [ - (1, "ADD"), - (2, "REMOVE"), - ]) -+ -+libxl__qemu_soundhw = Enumeration("qemu_soundhw", [ -+ (1, "ac97"), -+ (2, "adlib"), -+ (3, "cs4231a"), -+ (4, "es1370"), -+ (5, "gus"), -+ (6, "hda"), -+ (7, "sb16"), -+ ]) --- -2.37.4 - diff --git a/0017-tools-ocaml-libs-Allocate-the-correct-amount-of-memo.patch b/0017-tools-ocaml-libs-Allocate-the-correct-amount-of-memo.patch new file mode 100644 index 0000000..9fa8d08 --- /dev/null +++ b/0017-tools-ocaml-libs-Allocate-the-correct-amount-of-memo.patch @@ -0,0 +1,80 @@ +From 6d66fb984cc768406158353cabf9a55652b0dea7 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 31 Jan 2023 10:59:42 +0000 +Subject: [PATCH 17/61] tools/ocaml/libs: Allocate the correct amount of memory + for Abstract_tag + +caml_alloc() takes units of Wsize (word size), not bytes. As a consequence, +we're allocating 4 or 8 times too much memory. + +Ocaml has a helper, Wsize_bsize(), but it truncates cases which aren't an +exact multiple. Use a BUILD_BUG_ON() to cover the potential for truncation, +as there's no rounding-up form of the helper. + +Fixes: 8b7ce06a2d34 ("ocaml: Add XC bindings.") +Fixes: d3e649277a13 ("ocaml: add mmap bindings implementation.") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 36eb2de31b6ecb8787698fb1a701bd708c8971b2) +--- + tools/ocaml/libs/mmap/Makefile | 2 ++ + tools/ocaml/libs/mmap/xenmmap_stubs.c | 6 +++++- + tools/ocaml/libs/xc/xenctrl_stubs.c | 5 ++++- + 3 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/tools/ocaml/libs/mmap/Makefile b/tools/ocaml/libs/mmap/Makefile +index df45819df5..a3bd75e33a 100644 +--- a/tools/ocaml/libs/mmap/Makefile ++++ b/tools/ocaml/libs/mmap/Makefile +@@ -2,6 +2,8 @@ TOPLEVEL=$(CURDIR)/../.. + XEN_ROOT=$(TOPLEVEL)/../.. 
+ include $(TOPLEVEL)/common.make + ++CFLAGS += $(CFLAGS_xeninclude) ++ + OBJS = xenmmap + INTF = $(foreach obj, $(OBJS),$(obj).cmi) + LIBS = xenmmap.cma xenmmap.cmxa +diff --git a/tools/ocaml/libs/mmap/xenmmap_stubs.c b/tools/ocaml/libs/mmap/xenmmap_stubs.c +index e03951d781..d623ad390e 100644 +--- a/tools/ocaml/libs/mmap/xenmmap_stubs.c ++++ b/tools/ocaml/libs/mmap/xenmmap_stubs.c +@@ -21,6 +21,8 @@ + #include <errno.h> + #include "mmap_stubs.h" + ++#include <xen-tools/libs.h> ++ + #include <caml/mlvalues.h> + #include <caml/memory.h> + #include <caml/alloc.h> +@@ -59,7 +61,9 @@ CAMLprim value stub_mmap_init(value fd, value pflag, value mflag, + default: caml_invalid_argument("maptype"); + } + +- result = caml_alloc(sizeof(struct mmap_interface), Abstract_tag); ++ BUILD_BUG_ON((sizeof(struct mmap_interface) % sizeof(value)) != 0); ++ result = caml_alloc(Wsize_bsize(sizeof(struct mmap_interface)), ++ Abstract_tag); + + if (mmap_interface_init(Intf_val(result), Int_val(fd), + c_pflag, c_mflag, +diff --git a/tools/ocaml/libs/xc/xenctrl_stubs.c b/tools/ocaml/libs/xc/xenctrl_stubs.c +index 434fc0345b..ec64341a9a 100644 +--- a/tools/ocaml/libs/xc/xenctrl_stubs.c ++++ b/tools/ocaml/libs/xc/xenctrl_stubs.c +@@ -940,7 +940,10 @@ CAMLprim value stub_map_foreign_range(value xch, value dom, + uint32_t c_dom; + unsigned long c_mfn; + +- result = caml_alloc(sizeof(struct mmap_interface), Abstract_tag); ++ BUILD_BUG_ON((sizeof(struct mmap_interface) % sizeof(value)) != 0); ++ result = caml_alloc(Wsize_bsize(sizeof(struct mmap_interface)), ++ Abstract_tag); ++ + intf = (struct mmap_interface *) result; + + intf->len = Int_val(size); +-- +2.40.0 + diff --git a/0018-tools-ocaml-evtchn-Don-t-reference-Custom-objects-wi.patch b/0018-tools-ocaml-evtchn-Don-t-reference-Custom-objects-wi.patch new file mode 100644 index 0000000..8e1c860 --- /dev/null +++ b/0018-tools-ocaml-evtchn-Don-t-reference-Custom-objects-wi.patch @@ -0,0 +1,213 @@ +From e18faeb91e620624106b94c8821f8c9574eddb17 
Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edwin.torok@cloud.com> +Date: Thu, 12 Jan 2023 17:48:29 +0000 +Subject: [PATCH 18/61] tools/ocaml/evtchn: Don't reference Custom objects with + the GC lock released +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The modification to the _H() macro for Ocaml 5 support introduced a subtle +bug. From the manual: + + https://ocaml.org/manual/intfc.html#ss:parallel-execution-long-running-c-code + +"After caml_release_runtime_system() was called and until +caml_acquire_runtime_system() is called, the C code must not access any OCaml +data, nor call any function of the run-time system, nor call back into OCaml +code." + +Previously, the value was a naked C pointer, so dereferencing it wasn't +"accessing any Ocaml data", but the fix to avoid naked C pointers added a +layer of indirection through an Ocaml Custom object, meaning that the common +pattern of using _H() in a blocking section is unsafe. + +In order to fix: + + * Drop the _H() macro and replace it with a static inline xce_of_val(). + * Opencode the assignment into Data_custom_val() in the two constructors. + * Rename "value xce" parameters to "value xce_val" so we can consistently + have "xenevtchn_handle *xce" on the stack, and obtain the pointer with the + GC lock still held. 
+ +Fixes: 22d5affdf0ce ("tools/ocaml/evtchn: OCaml 5 support, fix potential resource leak") +Signed-off-by: Edwin Török <edwin.torok@cloud.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 2636d8ff7a670c4d2485757dbe966e36c259a960) +--- + tools/ocaml/libs/eventchn/xeneventchn_stubs.c | 60 +++++++++++-------- + 1 file changed, 35 insertions(+), 25 deletions(-) + +diff --git a/tools/ocaml/libs/eventchn/xeneventchn_stubs.c b/tools/ocaml/libs/eventchn/xeneventchn_stubs.c +index aa8a69cc1e..d7881ca95f 100644 +--- a/tools/ocaml/libs/eventchn/xeneventchn_stubs.c ++++ b/tools/ocaml/libs/eventchn/xeneventchn_stubs.c +@@ -33,11 +33,14 @@ + #include <caml/fail.h> + #include <caml/signals.h> + +-#define _H(__h) (*((xenevtchn_handle **)Data_custom_val(__h))) ++static inline xenevtchn_handle *xce_of_val(value v) ++{ ++ return *(xenevtchn_handle **)Data_custom_val(v); ++} + + static void stub_evtchn_finalize(value v) + { +- xenevtchn_close(_H(v)); ++ xenevtchn_close(xce_of_val(v)); + } + + static struct custom_operations xenevtchn_ops = { +@@ -68,7 +71,7 @@ CAMLprim value stub_eventchn_init(value cloexec) + caml_failwith("open failed"); + + result = caml_alloc_custom(&xenevtchn_ops, sizeof(xce), 0, 1); +- _H(result) = xce; ++ *(xenevtchn_handle **)Data_custom_val(result) = xce; + + CAMLreturn(result); + } +@@ -87,18 +90,19 @@ CAMLprim value stub_eventchn_fdopen(value fdval) + caml_failwith("evtchn fdopen failed"); + + result = caml_alloc_custom(&xenevtchn_ops, sizeof(xce), 0, 1); +- _H(result) = xce; ++ *(xenevtchn_handle **)Data_custom_val(result) = xce; + + CAMLreturn(result); + } + +-CAMLprim value stub_eventchn_fd(value xce) ++CAMLprim value stub_eventchn_fd(value xce_val) + { +- CAMLparam1(xce); ++ CAMLparam1(xce_val); + CAMLlocal1(result); ++ xenevtchn_handle *xce = xce_of_val(xce_val); + int fd; + +- fd = xenevtchn_fd(_H(xce)); ++ fd = xenevtchn_fd(xce); + if (fd == -1) + 
caml_failwith("evtchn fd failed"); + +@@ -107,13 +111,14 @@ CAMLprim value stub_eventchn_fd(value xce) + CAMLreturn(result); + } + +-CAMLprim value stub_eventchn_notify(value xce, value port) ++CAMLprim value stub_eventchn_notify(value xce_val, value port) + { +- CAMLparam2(xce, port); ++ CAMLparam2(xce_val, port); ++ xenevtchn_handle *xce = xce_of_val(xce_val); + int rc; + + caml_enter_blocking_section(); +- rc = xenevtchn_notify(_H(xce), Int_val(port)); ++ rc = xenevtchn_notify(xce, Int_val(port)); + caml_leave_blocking_section(); + + if (rc == -1) +@@ -122,15 +127,16 @@ CAMLprim value stub_eventchn_notify(value xce, value port) + CAMLreturn(Val_unit); + } + +-CAMLprim value stub_eventchn_bind_interdomain(value xce, value domid, ++CAMLprim value stub_eventchn_bind_interdomain(value xce_val, value domid, + value remote_port) + { +- CAMLparam3(xce, domid, remote_port); ++ CAMLparam3(xce_val, domid, remote_port); + CAMLlocal1(port); ++ xenevtchn_handle *xce = xce_of_val(xce_val); + xenevtchn_port_or_error_t rc; + + caml_enter_blocking_section(); +- rc = xenevtchn_bind_interdomain(_H(xce), Int_val(domid), Int_val(remote_port)); ++ rc = xenevtchn_bind_interdomain(xce, Int_val(domid), Int_val(remote_port)); + caml_leave_blocking_section(); + + if (rc == -1) +@@ -140,14 +146,15 @@ CAMLprim value stub_eventchn_bind_interdomain(value xce, value domid, + CAMLreturn(port); + } + +-CAMLprim value stub_eventchn_bind_virq(value xce, value virq_type) ++CAMLprim value stub_eventchn_bind_virq(value xce_val, value virq_type) + { +- CAMLparam2(xce, virq_type); ++ CAMLparam2(xce_val, virq_type); + CAMLlocal1(port); ++ xenevtchn_handle *xce = xce_of_val(xce_val); + xenevtchn_port_or_error_t rc; + + caml_enter_blocking_section(); +- rc = xenevtchn_bind_virq(_H(xce), Int_val(virq_type)); ++ rc = xenevtchn_bind_virq(xce, Int_val(virq_type)); + caml_leave_blocking_section(); + + if (rc == -1) +@@ -157,13 +164,14 @@ CAMLprim value stub_eventchn_bind_virq(value xce, value virq_type) + 
CAMLreturn(port); + } + +-CAMLprim value stub_eventchn_unbind(value xce, value port) ++CAMLprim value stub_eventchn_unbind(value xce_val, value port) + { +- CAMLparam2(xce, port); ++ CAMLparam2(xce_val, port); ++ xenevtchn_handle *xce = xce_of_val(xce_val); + int rc; + + caml_enter_blocking_section(); +- rc = xenevtchn_unbind(_H(xce), Int_val(port)); ++ rc = xenevtchn_unbind(xce, Int_val(port)); + caml_leave_blocking_section(); + + if (rc == -1) +@@ -172,14 +180,15 @@ CAMLprim value stub_eventchn_unbind(value xce, value port) + CAMLreturn(Val_unit); + } + +-CAMLprim value stub_eventchn_pending(value xce) ++CAMLprim value stub_eventchn_pending(value xce_val) + { +- CAMLparam1(xce); ++ CAMLparam1(xce_val); + CAMLlocal1(result); ++ xenevtchn_handle *xce = xce_of_val(xce_val); + xenevtchn_port_or_error_t port; + + caml_enter_blocking_section(); +- port = xenevtchn_pending(_H(xce)); ++ port = xenevtchn_pending(xce); + caml_leave_blocking_section(); + + if (port == -1) +@@ -189,16 +198,17 @@ CAMLprim value stub_eventchn_pending(value xce) + CAMLreturn(result); + } + +-CAMLprim value stub_eventchn_unmask(value xce, value _port) ++CAMLprim value stub_eventchn_unmask(value xce_val, value _port) + { +- CAMLparam2(xce, _port); ++ CAMLparam2(xce_val, _port); ++ xenevtchn_handle *xce = xce_of_val(xce_val); + evtchn_port_t port; + int rc; + + port = Int_val(_port); + + caml_enter_blocking_section(); +- rc = xenevtchn_unmask(_H(xce), port); ++ rc = xenevtchn_unmask(xce, port); + caml_leave_blocking_section(); + + if (rc) +-- +2.40.0 + diff --git a/0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch b/0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch deleted file mode 100644 index 5fc8919..0000000 --- a/0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch +++ /dev/null @@ -1,44 +0,0 @@ -From e8882bcfe35520e950ba60acd6e67e65f1ce90a8 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 14:59:26 +0200 -Subject: 
[PATCH 18/87] x86/CPUID: surface suitable value in EBX of XSTATE - subleaf 1 - -While the SDM isn't very clear about this, our present behavior make -Linux 5.19 unhappy. As of commit 8ad7e8f69695 ("x86/fpu/xsave: Support -XSAVEC in the kernel") they're using this CPUID output also to size -the compacted area used by XSAVEC. Getting back zero there isn't really -liked, yet for PV that's the default on capable hardware: XSAVES isn't -exposed to PV domains. - -Considering that the size reported is that of the compacted save area, -I view Linux'es assumption as appropriate (short of the SDM properly -considering the case). Therefore we need to populate the field also when -only XSAVEC is supported for a guest. - -Fixes: 460b9a4b3630 ("x86/xsaves: enable xsaves/xrstors for hvm guest") -Fixes: 8d050ed1097c ("x86: don't expose XSAVES capability to PV guests") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: c3bd0b83ea5b7c0da6542687436042eeea1e7909 -master date: 2022-08-24 14:23:59 +0200 ---- - xen/arch/x86/cpuid.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c -index ff335f16390d..a647331f4793 100644 ---- a/xen/arch/x86/cpuid.c -+++ b/xen/arch/x86/cpuid.c -@@ -1060,7 +1060,7 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf, - switch ( subleaf ) - { - case 1: -- if ( p->xstate.xsaves ) -+ if ( p->xstate.xsavec || p->xstate.xsaves ) - { - /* - * TODO: Figure out what to do for XSS state. 
VT-x manages --- -2.37.4 - diff --git a/0019-tools-ocaml-xc-Fix-binding-for-xc_domain_assign_devi.patch b/0019-tools-ocaml-xc-Fix-binding-for-xc_domain_assign_devi.patch new file mode 100644 index 0000000..5571446 --- /dev/null +++ b/0019-tools-ocaml-xc-Fix-binding-for-xc_domain_assign_devi.patch @@ -0,0 +1,70 @@ +From 854013084e2c6267af7787df8b35d85646f79a54 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edwin.torok@cloud.com> +Date: Thu, 12 Jan 2023 11:38:38 +0000 +Subject: [PATCH 19/61] tools/ocaml/xc: Fix binding for + xc_domain_assign_device() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The patch adding this binding was plain broken, and unreviewed. It modified +the C stub to add a 4th parameter without an equivalent adjustment in the +Ocaml side of the bindings. + +In 64bit builds, this causes us to dereference whatever dead value is in %rcx +when trying to interpret the rflags parameter. + +This has gone unnoticed because Xapi doesn't use this binding (it has its +own), but unbreak the binding by passing RDM_RELAXED unconditionally for +now (matching the libxl default behaviour). 
+ +Fixes: 9b34056cb4 ("tools: extend xc_assign_device() to support rdm reservation policy") +Signed-off-by: Edwin Török <edwin.torok@cloud.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 4250683842104f02996428f93927a035c8e19266) +--- + tools/ocaml/libs/xc/xenctrl_stubs.c | 17 +++++------------ + 1 file changed, 5 insertions(+), 12 deletions(-) + +diff --git a/tools/ocaml/libs/xc/xenctrl_stubs.c b/tools/ocaml/libs/xc/xenctrl_stubs.c +index ec64341a9a..e2efcbe182 100644 +--- a/tools/ocaml/libs/xc/xenctrl_stubs.c ++++ b/tools/ocaml/libs/xc/xenctrl_stubs.c +@@ -1123,17 +1123,12 @@ CAMLprim value stub_xc_domain_test_assign_device(value xch, value domid, value d + CAMLreturn(Val_bool(ret == 0)); + } + +-static int domain_assign_device_rdm_flag_table[] = { +- XEN_DOMCTL_DEV_RDM_RELAXED, +-}; +- +-CAMLprim value stub_xc_domain_assign_device(value xch, value domid, value desc, +- value rflag) ++CAMLprim value stub_xc_domain_assign_device(value xch, value domid, value desc) + { +- CAMLparam4(xch, domid, desc, rflag); ++ CAMLparam3(xch, domid, desc); + int ret; + int domain, bus, dev, func; +- uint32_t sbdf, flag; ++ uint32_t sbdf; + + domain = Int_val(Field(desc, 0)); + bus = Int_val(Field(desc, 1)); +@@ -1141,10 +1136,8 @@ CAMLprim value stub_xc_domain_assign_device(value xch, value domid, value desc, + func = Int_val(Field(desc, 3)); + sbdf = encode_sbdf(domain, bus, dev, func); + +- ret = Int_val(Field(rflag, 0)); +- flag = domain_assign_device_rdm_flag_table[ret]; +- +- ret = xc_assign_device(_H(xch), _D(domid), sbdf, flag); ++ ret = xc_assign_device(_H(xch), _D(domid), sbdf, ++ XEN_DOMCTL_DEV_RDM_RELAXED); + + if (ret < 0) + failwith_xc(_H(xch)); +-- +2.40.0 + diff --git a/0019-xen-sched-introduce-cpupool_update_node_affinity.patch b/0019-xen-sched-introduce-cpupool_update_node_affinity.patch deleted file mode 100644 index badb8c3..0000000 --- 
a/0019-xen-sched-introduce-cpupool_update_node_affinity.patch +++ /dev/null @@ -1,257 +0,0 @@ -From d4e971ad12dd27913dffcf96b5de378ea7b476e1 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 14:59:40 +0200 -Subject: [PATCH 19/87] xen/sched: introduce cpupool_update_node_affinity() - -For updating the node affinities of all domains in a cpupool add a new -function cpupool_update_node_affinity(). - -In order to avoid multiple allocations of cpumasks carve out memory -allocation and freeing from domain_update_node_affinity() into new -helpers, which can be used by cpupool_update_node_affinity(). - -Modify domain_update_node_affinity() to take an additional parameter -for passing the allocated memory in and to allocate and free the memory -via the new helpers in case NULL was passed. - -This will help later to pre-allocate the cpumasks in order to avoid -allocations in stop-machine context. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: a83fa1e2b96ace65b45dde6954d67012633a082b -master date: 2022-09-05 11:42:30 +0100 ---- - xen/common/sched/core.c | 54 ++++++++++++++++++++++++++------------ - xen/common/sched/cpupool.c | 39 +++++++++++++++------------ - xen/common/sched/private.h | 7 +++++ - xen/include/xen/sched.h | 9 ++++++- - 4 files changed, 74 insertions(+), 35 deletions(-) - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index f07bd2681fcb..065a83eca912 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -1824,9 +1824,28 @@ int vcpu_affinity_domctl(struct domain *d, uint32_t cmd, - return ret; - } - --void domain_update_node_affinity(struct domain *d) -+bool alloc_affinity_masks(struct affinity_masks *affinity) - { -- cpumask_var_t dom_cpumask, dom_cpumask_soft; -+ if ( !alloc_cpumask_var(&affinity->hard) ) -+ return 
false; -+ if ( !alloc_cpumask_var(&affinity->soft) ) -+ { -+ free_cpumask_var(affinity->hard); -+ return false; -+ } -+ -+ return true; -+} -+ -+void free_affinity_masks(struct affinity_masks *affinity) -+{ -+ free_cpumask_var(affinity->soft); -+ free_cpumask_var(affinity->hard); -+} -+ -+void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity) -+{ -+ struct affinity_masks masks; - cpumask_t *dom_affinity; - const cpumask_t *online; - struct sched_unit *unit; -@@ -1836,14 +1855,16 @@ void domain_update_node_affinity(struct domain *d) - if ( !d->vcpu || !d->vcpu[0] ) - return; - -- if ( !zalloc_cpumask_var(&dom_cpumask) ) -- return; -- if ( !zalloc_cpumask_var(&dom_cpumask_soft) ) -+ if ( !affinity ) - { -- free_cpumask_var(dom_cpumask); -- return; -+ affinity = &masks; -+ if ( !alloc_affinity_masks(affinity) ) -+ return; - } - -+ cpumask_clear(affinity->hard); -+ cpumask_clear(affinity->soft); -+ - online = cpupool_domain_master_cpumask(d); - - spin_lock(&d->node_affinity_lock); -@@ -1864,22 +1885,21 @@ void domain_update_node_affinity(struct domain *d) - */ - for_each_sched_unit ( d, unit ) - { -- cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity); -- cpumask_or(dom_cpumask_soft, dom_cpumask_soft, -- unit->cpu_soft_affinity); -+ cpumask_or(affinity->hard, affinity->hard, unit->cpu_hard_affinity); -+ cpumask_or(affinity->soft, affinity->soft, unit->cpu_soft_affinity); - } - /* Filter out non-online cpus */ -- cpumask_and(dom_cpumask, dom_cpumask, online); -- ASSERT(!cpumask_empty(dom_cpumask)); -+ cpumask_and(affinity->hard, affinity->hard, online); -+ ASSERT(!cpumask_empty(affinity->hard)); - /* And compute the intersection between hard, online and soft */ -- cpumask_and(dom_cpumask_soft, dom_cpumask_soft, dom_cpumask); -+ cpumask_and(affinity->soft, affinity->soft, affinity->hard); - - /* - * If not empty, the intersection of hard, soft and online is the - * narrowest set we want. If empty, we fall back to hard&online. 
- */ -- dom_affinity = cpumask_empty(dom_cpumask_soft) ? -- dom_cpumask : dom_cpumask_soft; -+ dom_affinity = cpumask_empty(affinity->soft) ? affinity->hard -+ : affinity->soft; - - nodes_clear(d->node_affinity); - for_each_cpu ( cpu, dom_affinity ) -@@ -1888,8 +1908,8 @@ void domain_update_node_affinity(struct domain *d) - - spin_unlock(&d->node_affinity_lock); - -- free_cpumask_var(dom_cpumask_soft); -- free_cpumask_var(dom_cpumask); -+ if ( affinity == &masks ) -+ free_affinity_masks(affinity); - } - - typedef long ret_t; -diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c -index 8c6e6eb9ccd5..45b6ff99561a 100644 ---- a/xen/common/sched/cpupool.c -+++ b/xen/common/sched/cpupool.c -@@ -401,6 +401,25 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) - return ret; - } - -+/* Update affinities of all domains in a cpupool. */ -+static void cpupool_update_node_affinity(const struct cpupool *c) -+{ -+ struct affinity_masks masks; -+ struct domain *d; -+ -+ if ( !alloc_affinity_masks(&masks) ) -+ return; -+ -+ rcu_read_lock(&domlist_read_lock); -+ -+ for_each_domain_in_cpupool(d, c) -+ domain_update_node_aff(d, &masks); -+ -+ rcu_read_unlock(&domlist_read_lock); -+ -+ free_affinity_masks(&masks); -+} -+ - /* - * assign a specific cpu to a cpupool - * cpupool_lock must be held -@@ -408,7 +427,6 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) - static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) - { - int ret; -- struct domain *d; - const cpumask_t *cpus; - - cpus = sched_get_opt_cpumask(c->gran, cpu); -@@ -433,12 +451,7 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) - - rcu_read_unlock(&sched_res_rculock); - -- rcu_read_lock(&domlist_read_lock); -- for_each_domain_in_cpupool(d, c) -- { -- domain_update_node_affinity(d); -- } -- rcu_read_unlock(&domlist_read_lock); -+ cpupool_update_node_affinity(c); - - return 0; - } -@@ -447,18 +460,14 @@ static int 
cpupool_unassign_cpu_finish(struct cpupool *c) - { - int cpu = cpupool_moving_cpu; - const cpumask_t *cpus; -- struct domain *d; - int ret; - - if ( c != cpupool_cpu_moving ) - return -EADDRNOTAVAIL; - -- /* -- * We need this for scanning the domain list, both in -- * cpu_disable_scheduler(), and at the bottom of this function. -- */ - rcu_read_lock(&domlist_read_lock); - ret = cpu_disable_scheduler(cpu); -+ rcu_read_unlock(&domlist_read_lock); - - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; -@@ -485,11 +494,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) - } - rcu_read_unlock(&sched_res_rculock); - -- for_each_domain_in_cpupool(d, c) -- { -- domain_update_node_affinity(d); -- } -- rcu_read_unlock(&domlist_read_lock); -+ cpupool_update_node_affinity(c); - - return ret; - } -diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h -index a870320146ef..2b04b01a0c0a 100644 ---- a/xen/common/sched/private.h -+++ b/xen/common/sched/private.h -@@ -593,6 +593,13 @@ affinity_balance_cpumask(const struct sched_unit *unit, int step, - cpumask_copy(mask, unit->cpu_hard_affinity); - } - -+struct affinity_masks { -+ cpumask_var_t hard; -+ cpumask_var_t soft; -+}; -+ -+bool alloc_affinity_masks(struct affinity_masks *affinity); -+void free_affinity_masks(struct affinity_masks *affinity); - void sched_rm_cpu(unsigned int cpu); - const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu); - void schedule_dump(struct cpupool *c); -diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h -index 9671062360ac..3f4225738a40 100644 ---- a/xen/include/xen/sched.h -+++ b/xen/include/xen/sched.h -@@ -655,8 +655,15 @@ static inline void get_knownalive_domain(struct domain *d) - ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED)); - } - -+struct affinity_masks; -+ - int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity); --void domain_update_node_affinity(struct domain *d); -+void 
domain_update_node_aff(struct domain *d, struct affinity_masks *affinity); -+ -+static inline void domain_update_node_affinity(struct domain *d) -+{ -+ domain_update_node_aff(d, NULL); -+} - - /* - * To be implemented by each architecture, sanity checking the configuration --- -2.37.4 - diff --git a/0020-tools-ocaml-xc-Don-t-reference-Abstract_Tag-objects-.patch b/0020-tools-ocaml-xc-Don-t-reference-Abstract_Tag-objects-.patch new file mode 100644 index 0000000..a829d36 --- /dev/null +++ b/0020-tools-ocaml-xc-Don-t-reference-Abstract_Tag-objects-.patch @@ -0,0 +1,76 @@ +From 1fdff77e26290ae1ed40e8253959d12a0c4b3d3f Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Tue, 31 Jan 2023 17:19:30 +0000 +Subject: [PATCH 20/61] tools/ocaml/xc: Don't reference Abstract_Tag objects + with the GC lock released + +The intf->{addr,len} references in the xc_map_foreign_range() call are unsafe. +From the manual: + + https://ocaml.org/manual/intfc.html#ss:parallel-execution-long-running-c-code + +"After caml_release_runtime_system() was called and until +caml_acquire_runtime_system() is called, the C code must not access any OCaml +data, nor call any function of the run-time system, nor call back into OCaml +code." + +More than what the manual says, the intf pointer is (potentially) invalidated +by caml_enter_blocking_section() if another thread happens to perform garbage +collection at just the right (wrong) moment. + +Rewrite the logic. There's no need to stash data in the Ocaml object until +the success path at the very end. 
+ +Fixes: 8b7ce06a2d34 ("ocaml: Add XC bindings.") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit 9e7c74e6f9fd2e44df1212643b80af9032b45b07) +--- + tools/ocaml/libs/xc/xenctrl_stubs.c | 23 +++++++++++------------ + 1 file changed, 11 insertions(+), 12 deletions(-) + +diff --git a/tools/ocaml/libs/xc/xenctrl_stubs.c b/tools/ocaml/libs/xc/xenctrl_stubs.c +index e2efcbe182..0a0fe45c54 100644 +--- a/tools/ocaml/libs/xc/xenctrl_stubs.c ++++ b/tools/ocaml/libs/xc/xenctrl_stubs.c +@@ -937,26 +937,25 @@ CAMLprim value stub_map_foreign_range(value xch, value dom, + CAMLparam4(xch, dom, size, mfn); + CAMLlocal1(result); + struct mmap_interface *intf; +- uint32_t c_dom; +- unsigned long c_mfn; ++ unsigned long c_mfn = Nativeint_val(mfn); ++ int len = Int_val(size); ++ void *ptr; + + BUILD_BUG_ON((sizeof(struct mmap_interface) % sizeof(value)) != 0); + result = caml_alloc(Wsize_bsize(sizeof(struct mmap_interface)), + Abstract_tag); + +- intf = (struct mmap_interface *) result; +- +- intf->len = Int_val(size); +- +- c_dom = _D(dom); +- c_mfn = Nativeint_val(mfn); + caml_enter_blocking_section(); +- intf->addr = xc_map_foreign_range(_H(xch), c_dom, +- intf->len, PROT_READ|PROT_WRITE, +- c_mfn); ++ ptr = xc_map_foreign_range(_H(xch), _D(dom), len, ++ PROT_READ|PROT_WRITE, c_mfn); + caml_leave_blocking_section(); +- if (!intf->addr) ++ ++ if (!ptr) + caml_failwith("xc_map_foreign_range error"); ++ ++ intf = Data_abstract_val(result); ++ *intf = (struct mmap_interface){ ptr, len }; ++ + CAMLreturn(result); + } + +-- +2.40.0 + diff --git a/0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch b/0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch deleted file mode 100644 index 0a04620..0000000 --- a/0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch +++ /dev/null @@ -1,263 +0,0 @@ -From c377ceab0a007690a1e71c81a5232613c99e944d Mon Sep 17 00:00:00 2001 
-From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 15:00:05 +0200 -Subject: [PATCH 20/87] xen/sched: carve out memory allocation and freeing from - schedule_cpu_rm() - -In order to prepare not allocating or freeing memory from -schedule_cpu_rm(), move this functionality to dedicated functions. - -For now call those functions from schedule_cpu_rm(). - -No change of behavior expected. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: d42be6f83480b3ada286dc18444331a816be88a3 -master date: 2022-09-05 11:42:30 +0100 ---- - xen/common/sched/core.c | 143 ++++++++++++++++++++++--------------- - xen/common/sched/private.h | 11 +++ - 2 files changed, 98 insertions(+), 56 deletions(-) - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index 065a83eca912..2decb1161a63 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -3221,6 +3221,75 @@ out: - return ret; - } - -+/* -+ * Allocate all memory needed for free_cpu_rm_data(), as allocations cannot -+ * be made in stop_machine() context. -+ * -+ * Between alloc_cpu_rm_data() and the real cpu removal action the relevant -+ * contents of struct sched_resource can't change, as the cpu in question is -+ * locked against any other movement to or from cpupools, and the data copied -+ * by alloc_cpu_rm_data() is modified only in case the cpu in question is -+ * being moved from or to a cpupool. 
-+ */ -+struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) -+{ -+ struct cpu_rm_data *data; -+ const struct sched_resource *sr; -+ unsigned int idx; -+ -+ rcu_read_lock(&sched_res_rculock); -+ -+ sr = get_sched_res(cpu); -+ data = xmalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1); -+ if ( !data ) -+ goto out; -+ -+ data->old_ops = sr->scheduler; -+ data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv; -+ data->ppriv_old = sr->sched_priv; -+ -+ for ( idx = 0; idx < sr->granularity - 1; idx++ ) -+ { -+ data->sr[idx] = sched_alloc_res(); -+ if ( data->sr[idx] ) -+ { -+ data->sr[idx]->sched_unit_idle = sched_alloc_unit_mem(); -+ if ( !data->sr[idx]->sched_unit_idle ) -+ { -+ sched_res_free(&data->sr[idx]->rcu); -+ data->sr[idx] = NULL; -+ } -+ } -+ if ( !data->sr[idx] ) -+ { -+ while ( idx > 0 ) -+ sched_res_free(&data->sr[--idx]->rcu); -+ XFREE(data); -+ goto out; -+ } -+ -+ data->sr[idx]->curr = data->sr[idx]->sched_unit_idle; -+ data->sr[idx]->scheduler = &sched_idle_ops; -+ data->sr[idx]->granularity = 1; -+ -+ /* We want the lock not to change when replacing the resource. */ -+ data->sr[idx]->schedule_lock = sr->schedule_lock; -+ } -+ -+ out: -+ rcu_read_unlock(&sched_res_rculock); -+ -+ return data; -+} -+ -+void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) -+{ -+ sched_free_udata(mem->old_ops, mem->vpriv_old); -+ sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu); -+ -+ xfree(mem); -+} -+ - /* - * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops - * (the idle scheduler). 
-@@ -3229,53 +3298,23 @@ out: - */ - int schedule_cpu_rm(unsigned int cpu) - { -- void *ppriv_old, *vpriv_old; -- struct sched_resource *sr, **sr_new = NULL; -+ struct sched_resource *sr; -+ struct cpu_rm_data *data; - struct sched_unit *unit; -- struct scheduler *old_ops; - spinlock_t *old_lock; - unsigned long flags; -- int idx, ret = -ENOMEM; -+ int idx = 0; - unsigned int cpu_iter; - -+ data = alloc_cpu_rm_data(cpu); -+ if ( !data ) -+ return -ENOMEM; -+ - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); -- old_ops = sr->scheduler; - -- if ( sr->granularity > 1 ) -- { -- sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1); -- if ( !sr_new ) -- goto out; -- for ( idx = 0; idx < sr->granularity - 1; idx++ ) -- { -- sr_new[idx] = sched_alloc_res(); -- if ( sr_new[idx] ) -- { -- sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem(); -- if ( !sr_new[idx]->sched_unit_idle ) -- { -- sched_res_free(&sr_new[idx]->rcu); -- sr_new[idx] = NULL; -- } -- } -- if ( !sr_new[idx] ) -- { -- for ( idx--; idx >= 0; idx-- ) -- sched_res_free(&sr_new[idx]->rcu); -- goto out; -- } -- sr_new[idx]->curr = sr_new[idx]->sched_unit_idle; -- sr_new[idx]->scheduler = &sched_idle_ops; -- sr_new[idx]->granularity = 1; -- -- /* We want the lock not to change when replacing the resource. */ -- sr_new[idx]->schedule_lock = sr->schedule_lock; -- } -- } -- -- ret = 0; -+ ASSERT(sr->granularity); - ASSERT(sr->cpupool != NULL); - ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); - ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid)); -@@ -3283,10 +3322,6 @@ int schedule_cpu_rm(unsigned int cpu) - /* See comment in schedule_cpu_add() regarding lock switching. 
*/ - old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); - -- vpriv_old = idle_vcpu[cpu]->sched_unit->priv; -- ppriv_old = sr->sched_priv; -- -- idx = 0; - for_each_cpu ( cpu_iter, sr->cpus ) - { - per_cpu(sched_res_idx, cpu_iter) = 0; -@@ -3300,27 +3335,27 @@ int schedule_cpu_rm(unsigned int cpu) - else - { - /* Initialize unit. */ -- unit = sr_new[idx]->sched_unit_idle; -- unit->res = sr_new[idx]; -+ unit = data->sr[idx]->sched_unit_idle; -+ unit->res = data->sr[idx]; - unit->is_running = true; - sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]); - sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain); - - /* Adjust cpu masks of resources (old and new). */ - cpumask_clear_cpu(cpu_iter, sr->cpus); -- cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus); -+ cpumask_set_cpu(cpu_iter, data->sr[idx]->cpus); - cpumask_set_cpu(cpu_iter, &sched_res_mask); - - /* Init timer. */ -- init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter); -+ init_timer(&data->sr[idx]->s_timer, s_timer_fn, NULL, cpu_iter); - - /* Last resource initializations and insert resource pointer. */ -- sr_new[idx]->master_cpu = cpu_iter; -- set_sched_res(cpu_iter, sr_new[idx]); -+ data->sr[idx]->master_cpu = cpu_iter; -+ set_sched_res(cpu_iter, data->sr[idx]); - - /* Last action: set the new lock pointer. */ - smp_mb(); -- sr_new[idx]->schedule_lock = &sched_free_cpu_lock; -+ data->sr[idx]->schedule_lock = &sched_free_cpu_lock; - - idx++; - } -@@ -3336,16 +3371,12 @@ int schedule_cpu_rm(unsigned int cpu) - /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! 
*/ - spin_unlock_irqrestore(old_lock, flags); - -- sched_deinit_pdata(old_ops, ppriv_old, cpu); -+ sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu); - -- sched_free_udata(old_ops, vpriv_old); -- sched_free_pdata(old_ops, ppriv_old, cpu); -- --out: - rcu_read_unlock(&sched_res_rculock); -- xfree(sr_new); -+ free_cpu_rm_data(data, cpu); - -- return ret; -+ return 0; - } - - struct scheduler *scheduler_get_default(void) -diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h -index 2b04b01a0c0a..e286849a1312 100644 ---- a/xen/common/sched/private.h -+++ b/xen/common/sched/private.h -@@ -600,6 +600,15 @@ struct affinity_masks { - - bool alloc_affinity_masks(struct affinity_masks *affinity); - void free_affinity_masks(struct affinity_masks *affinity); -+ -+/* Memory allocation related data for schedule_cpu_rm(). */ -+struct cpu_rm_data { -+ const struct scheduler *old_ops; -+ void *ppriv_old; -+ void *vpriv_old; -+ struct sched_resource *sr[]; -+}; -+ - void sched_rm_cpu(unsigned int cpu); - const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu); - void schedule_dump(struct cpupool *c); -@@ -608,6 +617,8 @@ struct scheduler *scheduler_alloc(unsigned int sched_id); - void scheduler_free(struct scheduler *sched); - int cpu_disable_scheduler(unsigned int cpu); - int schedule_cpu_add(unsigned int cpu, struct cpupool *c); -+struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu); -+void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu); - int schedule_cpu_rm(unsigned int cpu); - int sched_move_domain(struct domain *d, struct cpupool *c); - struct cpupool *cpupool_get_by_id(unsigned int poolid); --- -2.37.4 - diff --git a/0021-tools-ocaml-libs-Fix-memory-resource-leaks-with-caml.patch b/0021-tools-ocaml-libs-Fix-memory-resource-leaks-with-caml.patch new file mode 100644 index 0000000..8ed7dfa --- /dev/null +++ b/0021-tools-ocaml-libs-Fix-memory-resource-leaks-with-caml.patch @@ -0,0 +1,61 @@ +From 
1b6acdeeb2323c53d841356da50440e274e7bf9a Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Wed, 1 Feb 2023 11:27:42 +0000 +Subject: [PATCH 21/61] tools/ocaml/libs: Fix memory/resource leaks with + caml_alloc_custom() + +All caml_alloc_*() functions can throw exceptions, and longjump out of +context. If this happens, we leak the xch/xce handle. + +Reorder the logic to allocate the the Ocaml object first. + +Fixes: 8b3c06a3e545 ("tools/ocaml/xenctrl: OCaml 5 support, fix use-after-free") +Fixes: 22d5affdf0ce ("tools/ocaml/evtchn: OCaml 5 support, fix potential resource leak") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Christian Lindig <christian.lindig@citrix.com> +(cherry picked from commit d69ccf52ad467ccc22029172a8e61dc621187889) +--- + tools/ocaml/libs/eventchn/xeneventchn_stubs.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/tools/ocaml/libs/eventchn/xeneventchn_stubs.c b/tools/ocaml/libs/eventchn/xeneventchn_stubs.c +index d7881ca95f..de2fc29292 100644 +--- a/tools/ocaml/libs/eventchn/xeneventchn_stubs.c ++++ b/tools/ocaml/libs/eventchn/xeneventchn_stubs.c +@@ -63,6 +63,8 @@ CAMLprim value stub_eventchn_init(value cloexec) + if ( !Bool_val(cloexec) ) + flags |= XENEVTCHN_NO_CLOEXEC; + ++ result = caml_alloc_custom(&xenevtchn_ops, sizeof(xce), 0, 1); ++ + caml_enter_blocking_section(); + xce = xenevtchn_open(NULL, flags); + caml_leave_blocking_section(); +@@ -70,7 +72,6 @@ CAMLprim value stub_eventchn_init(value cloexec) + if (xce == NULL) + caml_failwith("open failed"); + +- result = caml_alloc_custom(&xenevtchn_ops, sizeof(xce), 0, 1); + *(xenevtchn_handle **)Data_custom_val(result) = xce; + + CAMLreturn(result); +@@ -82,6 +83,8 @@ CAMLprim value stub_eventchn_fdopen(value fdval) + CAMLlocal1(result); + xenevtchn_handle *xce; + ++ result = caml_alloc_custom(&xenevtchn_ops, sizeof(xce), 0, 1); ++ + caml_enter_blocking_section(); + xce = xenevtchn_fdopen(NULL, Int_val(fdval), 
0); + caml_leave_blocking_section(); +@@ -89,7 +92,6 @@ CAMLprim value stub_eventchn_fdopen(value fdval) + if (xce == NULL) + caml_failwith("evtchn fdopen failed"); + +- result = caml_alloc_custom(&xenevtchn_ops, sizeof(xce), 0, 1); + *(xenevtchn_handle **)Data_custom_val(result) = xce; + + CAMLreturn(result); +-- +2.40.0 + diff --git a/0021-xen-sched-fix-cpu-hotplug.patch b/0021-xen-sched-fix-cpu-hotplug.patch deleted file mode 100644 index ac3b1d7..0000000 --- a/0021-xen-sched-fix-cpu-hotplug.patch +++ /dev/null @@ -1,307 +0,0 @@ -From 4f3204c2bc66db18c61600dd3e08bf1fd9584a1b Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 15:00:19 +0200 -Subject: [PATCH 21/87] xen/sched: fix cpu hotplug - -Cpu unplugging is calling schedule_cpu_rm() via stop_machine_run() with -interrupts disabled, thus any memory allocation or freeing must be -avoided. - -Since commit 5047cd1d5dea ("xen/common: Use enhanced -ASSERT_ALLOC_CONTEXT in xmalloc()") this restriction is being enforced -via an assertion, which will now fail. - -Fix this by allocating needed memory before entering stop_machine_run() -and freeing any memory only after having finished stop_machine_run(). 
- -Fixes: 1ec410112cdd ("xen/sched: support differing granularity in schedule_cpu_[add/rm]()") -Reported-by: Gao Ruifeng <ruifeng.gao@intel.com> -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: d84473689611eed32fd90b27e614f28af767fa3f -master date: 2022-09-05 11:42:30 +0100 ---- - xen/common/sched/core.c | 25 +++++++++++--- - xen/common/sched/cpupool.c | 69 +++++++++++++++++++++++++++++--------- - xen/common/sched/private.h | 5 +-- - 3 files changed, 77 insertions(+), 22 deletions(-) - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index 2decb1161a63..900aab8f66a7 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -3231,7 +3231,7 @@ out: - * by alloc_cpu_rm_data() is modified only in case the cpu in question is - * being moved from or to a cpupool. - */ --struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) -+struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc) - { - struct cpu_rm_data *data; - const struct sched_resource *sr; -@@ -3244,6 +3244,17 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) - if ( !data ) - goto out; - -+ if ( aff_alloc ) -+ { -+ if ( !alloc_affinity_masks(&data->affinity) ) -+ { -+ XFREE(data); -+ goto out; -+ } -+ } -+ else -+ memset(&data->affinity, 0, sizeof(data->affinity)); -+ - data->old_ops = sr->scheduler; - data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv; - data->ppriv_old = sr->sched_priv; -@@ -3264,6 +3275,7 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) - { - while ( idx > 0 ) - sched_res_free(&data->sr[--idx]->rcu); -+ free_affinity_masks(&data->affinity); - XFREE(data); - goto out; - } -@@ -3286,6 +3298,7 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) - { - sched_free_udata(mem->old_ops, mem->vpriv_old); - sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu); 
-+ free_affinity_masks(&mem->affinity); - - xfree(mem); - } -@@ -3296,17 +3309,18 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) - * The cpu is already marked as "free" and not valid any longer for its - * cpupool. - */ --int schedule_cpu_rm(unsigned int cpu) -+int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *data) - { - struct sched_resource *sr; -- struct cpu_rm_data *data; - struct sched_unit *unit; - spinlock_t *old_lock; - unsigned long flags; - int idx = 0; - unsigned int cpu_iter; -+ bool free_data = !data; - -- data = alloc_cpu_rm_data(cpu); -+ if ( !data ) -+ data = alloc_cpu_rm_data(cpu, false); - if ( !data ) - return -ENOMEM; - -@@ -3374,7 +3388,8 @@ int schedule_cpu_rm(unsigned int cpu) - sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu); - - rcu_read_unlock(&sched_res_rculock); -- free_cpu_rm_data(data, cpu); -+ if ( free_data ) -+ free_cpu_rm_data(data, cpu); - - return 0; - } -diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c -index 45b6ff99561a..b5a948639aad 100644 ---- a/xen/common/sched/cpupool.c -+++ b/xen/common/sched/cpupool.c -@@ -402,22 +402,28 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) - } - - /* Update affinities of all domains in a cpupool. 
*/ --static void cpupool_update_node_affinity(const struct cpupool *c) -+static void cpupool_update_node_affinity(const struct cpupool *c, -+ struct affinity_masks *masks) - { -- struct affinity_masks masks; -+ struct affinity_masks local_masks; - struct domain *d; - -- if ( !alloc_affinity_masks(&masks) ) -- return; -+ if ( !masks ) -+ { -+ if ( !alloc_affinity_masks(&local_masks) ) -+ return; -+ masks = &local_masks; -+ } - - rcu_read_lock(&domlist_read_lock); - - for_each_domain_in_cpupool(d, c) -- domain_update_node_aff(d, &masks); -+ domain_update_node_aff(d, masks); - - rcu_read_unlock(&domlist_read_lock); - -- free_affinity_masks(&masks); -+ if ( masks == &local_masks ) -+ free_affinity_masks(masks); - } - - /* -@@ -451,15 +457,17 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) - - rcu_read_unlock(&sched_res_rculock); - -- cpupool_update_node_affinity(c); -+ cpupool_update_node_affinity(c, NULL); - - return 0; - } - --static int cpupool_unassign_cpu_finish(struct cpupool *c) -+static int cpupool_unassign_cpu_finish(struct cpupool *c, -+ struct cpu_rm_data *mem) - { - int cpu = cpupool_moving_cpu; - const cpumask_t *cpus; -+ struct affinity_masks *masks = mem ? 
&mem->affinity : NULL; - int ret; - - if ( c != cpupool_cpu_moving ) -@@ -482,7 +490,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) - */ - if ( !ret ) - { -- ret = schedule_cpu_rm(cpu); -+ ret = schedule_cpu_rm(cpu, mem); - if ( ret ) - cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - else -@@ -494,7 +502,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) - } - rcu_read_unlock(&sched_res_rculock); - -- cpupool_update_node_affinity(c); -+ cpupool_update_node_affinity(c, masks); - - return ret; - } -@@ -558,7 +566,7 @@ static long cpupool_unassign_cpu_helper(void *info) - cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu); - spin_lock(&cpupool_lock); - -- ret = cpupool_unassign_cpu_finish(c); -+ ret = cpupool_unassign_cpu_finish(c, NULL); - - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret); -@@ -701,7 +709,7 @@ static int cpupool_cpu_add(unsigned int cpu) - * This function is called in stop_machine context, so we can be sure no - * non-idle vcpu is active on the system. 
- */ --static void cpupool_cpu_remove(unsigned int cpu) -+static void cpupool_cpu_remove(unsigned int cpu, struct cpu_rm_data *mem) - { - int ret; - -@@ -709,7 +717,7 @@ static void cpupool_cpu_remove(unsigned int cpu) - - if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) - { -- ret = cpupool_unassign_cpu_finish(cpupool0); -+ ret = cpupool_unassign_cpu_finish(cpupool0, mem); - BUG_ON(ret); - } - cpumask_clear_cpu(cpu, &cpupool_free_cpus); -@@ -775,7 +783,7 @@ static void cpupool_cpu_remove_forced(unsigned int cpu) - { - ret = cpupool_unassign_cpu_start(c, master_cpu); - BUG_ON(ret); -- ret = cpupool_unassign_cpu_finish(c); -+ ret = cpupool_unassign_cpu_finish(c, NULL); - BUG_ON(ret); - } - } -@@ -993,12 +1001,24 @@ void dump_runq(unsigned char key) - static int cpu_callback( - struct notifier_block *nfb, unsigned long action, void *hcpu) - { -+ static struct cpu_rm_data *mem; -+ - unsigned int cpu = (unsigned long)hcpu; - int rc = 0; - - switch ( action ) - { - case CPU_DOWN_FAILED: -+ if ( system_state <= SYS_STATE_active ) -+ { -+ if ( mem ) -+ { -+ free_cpu_rm_data(mem, cpu); -+ mem = NULL; -+ } -+ rc = cpupool_cpu_add(cpu); -+ } -+ break; - case CPU_ONLINE: - if ( system_state <= SYS_STATE_active ) - rc = cpupool_cpu_add(cpu); -@@ -1006,12 +1026,31 @@ static int cpu_callback( - case CPU_DOWN_PREPARE: - /* Suspend/Resume don't change assignments of cpus to cpupools. */ - if ( system_state <= SYS_STATE_active ) -+ { - rc = cpupool_cpu_remove_prologue(cpu); -+ if ( !rc ) -+ { -+ ASSERT(!mem); -+ mem = alloc_cpu_rm_data(cpu, true); -+ rc = mem ? 0 : -ENOMEM; -+ } -+ } - break; - case CPU_DYING: - /* Suspend/Resume don't change assignments of cpus to cpupools. 
*/ - if ( system_state <= SYS_STATE_active ) -- cpupool_cpu_remove(cpu); -+ { -+ ASSERT(mem); -+ cpupool_cpu_remove(cpu, mem); -+ } -+ break; -+ case CPU_DEAD: -+ if ( system_state <= SYS_STATE_active ) -+ { -+ ASSERT(mem); -+ free_cpu_rm_data(mem, cpu); -+ mem = NULL; -+ } - break; - case CPU_RESUME_FAILED: - cpupool_cpu_remove_forced(cpu); -diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h -index e286849a1312..0126a4bb9ed3 100644 ---- a/xen/common/sched/private.h -+++ b/xen/common/sched/private.h -@@ -603,6 +603,7 @@ void free_affinity_masks(struct affinity_masks *affinity); - - /* Memory allocation related data for schedule_cpu_rm(). */ - struct cpu_rm_data { -+ struct affinity_masks affinity; - const struct scheduler *old_ops; - void *ppriv_old; - void *vpriv_old; -@@ -617,9 +618,9 @@ struct scheduler *scheduler_alloc(unsigned int sched_id); - void scheduler_free(struct scheduler *sched); - int cpu_disable_scheduler(unsigned int cpu); - int schedule_cpu_add(unsigned int cpu, struct cpupool *c); --struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu); -+struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc); - void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu); --int schedule_cpu_rm(unsigned int cpu); -+int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *mem); - int sched_move_domain(struct domain *d, struct cpupool *c); - struct cpupool *cpupool_get_by_id(unsigned int poolid); - void cpupool_put(struct cpupool *pool); --- -2.37.4 - diff --git a/0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch b/0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch deleted file mode 100644 index 5432b3c..0000000 --- a/0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 2b694dd2932be78431b14257f23b738f2fc8f6a1 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 15:00:33 +0200 -Subject: [PATCH 22/87] Config.mk: 
correct PIE-related option(s) in - EMBEDDED_EXTRA_CFLAGS - -I haven't been able to find evidence of "-nopie" ever having been a -supported compiler option. The correct spelling is "-no-pie". -Furthermore like "-pie" this is an option which is solely passed to the -linker. The compiler only recognizes "-fpie" / "-fPIE" / "-fno-pie", and -it doesn't infer these options from "-pie" / "-no-pie". - -Add the compiler recognized form, but for the possible case of the -variable also being used somewhere for linking keep the linker option as -well (with corrected spelling). - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> - -Build: Drop -no-pie from EMBEDDED_EXTRA_CFLAGS - -This breaks all Clang builds, as demostrated by Gitlab CI. - -Contrary to the description in ecd6b9759919, -no-pie is not even an option -passed to the linker. GCC's actual behaviour is to inhibit the passing of --pie to the linker, as well as selecting different cr0 artefacts to be linked. - -EMBEDDED_EXTRA_CFLAGS is not used for $(CC)-doing-linking, and not liable to -gain such a usecase. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -Tested-by: Stefano Stabellini <sstabellini@kernel.org> -Fixes: ecd6b9759919 ("Config.mk: correct PIE-related option(s) in EMBEDDED_EXTRA_CFLAGS") -master commit: ecd6b9759919fa6335b0be1b5fc5cce29a30c4f1 -master date: 2022-09-08 09:25:26 +0200 -master commit: 13a7c0074ac8fb31f6c0485429b7a20a1946cb22 -master date: 2022-09-27 15:40:42 -0700 ---- - Config.mk | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/Config.mk b/Config.mk -index 46de3cd1e0e1..6f95067b8de6 100644 ---- a/Config.mk -+++ b/Config.mk -@@ -197,7 +197,7 @@ endif - APPEND_LDFLAGS += $(foreach i, $(APPEND_LIB), -L$(i)) - APPEND_CFLAGS += $(foreach i, $(APPEND_INCLUDES), -I$(i)) - --EMBEDDED_EXTRA_CFLAGS := -nopie -fno-stack-protector -fno-stack-protector-all -+EMBEDDED_EXTRA_CFLAGS := -fno-pie -fno-stack-protector -fno-stack-protector-all - EMBEDDED_EXTRA_CFLAGS += -fno-exceptions -fno-asynchronous-unwind-tables - - XEN_EXTFILES_URL ?= http://xenbits.xen.org/xen-extfiles --- -2.37.4 - diff --git a/0022-x86-spec-ctrl-Mitigate-Cross-Thread-Return-Address-P.patch b/0022-x86-spec-ctrl-Mitigate-Cross-Thread-Return-Address-P.patch new file mode 100644 index 0000000..1d1edb0 --- /dev/null +++ b/0022-x86-spec-ctrl-Mitigate-Cross-Thread-Return-Address-P.patch @@ -0,0 +1,120 @@ +From d4e286db89d80c862b4a24bf971dd71008c8b53e Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Thu, 8 Sep 2022 21:27:58 +0100 +Subject: [PATCH 22/61] x86/spec-ctrl: Mitigate Cross-Thread Return Address + Predictions + +This is XSA-426 / CVE-2022-27672 + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit 63305e5392ec2d17b85e7996a97462744425db80) +--- + docs/misc/xen-command-line.pandoc | 2 +- + xen/arch/x86/spec_ctrl.c | 31 ++++++++++++++++++++++++++++--- + xen/include/asm-x86/cpufeatures.h | 3 ++- + 
xen/include/asm-x86/spec_ctrl.h | 15 +++++++++++++++ + 4 files changed, 46 insertions(+), 5 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index bd6826d0ae..b3f60cd923 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2275,7 +2275,7 @@ guests to use. + on entry and exit. These blocks are necessary to virtualise support for + guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. + * `rsb=` offers control over whether to overwrite the Return Stack Buffer / +- Return Address Stack on entry to Xen. ++ Return Address Stack on entry to Xen and on idle. + * `md-clear=` offers control over whether to use VERW to flush + microarchitectural buffers on idle and exit from Xen. *Note: For + compatibility with development versions of this fix, `mds=` is also accepted +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 90d86fe5cb..14649d92f5 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -1317,13 +1317,38 @@ void __init init_speculation_mitigations(void) + * 3) Some CPUs have RSBs which are not full width, which allow the + * attacker's entries to alias Xen addresses. + * ++ * 4) Some CPUs have RSBs which are re-partitioned based on thread ++ * idleness, which allows an attacker to inject entries into the other ++ * thread. We still active the optimisation in this case, and mitigate ++ * in the idle path which has lower overhead. ++ * + * It is safe to turn off RSB stuffing when Xen is using SMEP itself, and + * 32bit PV guests are disabled, and when the RSB is full width. 
+ */ + BUILD_BUG_ON(RO_MPT_VIRT_START != PML4_ADDR(256)); +- if ( opt_rsb_pv == -1 && boot_cpu_has(X86_FEATURE_XEN_SMEP) && +- !opt_pv32 && rsb_is_full_width() ) +- opt_rsb_pv = 0; ++ if ( opt_rsb_pv == -1 ) ++ { ++ opt_rsb_pv = (opt_pv32 || !boot_cpu_has(X86_FEATURE_XEN_SMEP) || ++ !rsb_is_full_width()); ++ ++ /* ++ * Cross-Thread Return Address Predictions. ++ * ++ * Vulnerable systems are Zen1/Zen2 uarch, which is AMD Fam17 / Hygon ++ * Fam18, when SMT is active. ++ * ++ * To mitigate, we must flush the RSB/RAS/RAP once between entering ++ * Xen and going idle. ++ * ++ * Most cases flush on entry to Xen anyway. The one case where we ++ * don't is when using the SMEP optimisation for PV guests. Flushing ++ * before going idle is less overhead than flushing on PV entry. ++ */ ++ if ( !opt_rsb_pv && hw_smt_enabled && ++ (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD|X86_VENDOR_HYGON)) && ++ (boot_cpu_data.x86 == 0x17 || boot_cpu_data.x86 == 0x18) ) ++ setup_force_cpu_cap(X86_FEATURE_SC_RSB_IDLE); ++ } + + if ( opt_rsb_pv ) + { +diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h +index ecc1bb0950..ccf9d7287c 100644 +--- a/xen/include/asm-x86/cpufeatures.h ++++ b/xen/include/asm-x86/cpufeatures.h +@@ -35,7 +35,8 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM + XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ + XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */ + XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ +-/* Bits 23,24 unused. */ ++/* Bits 23 unused. */ ++XEN_CPUFEATURE(SC_RSB_IDLE, X86_SYNTH(24)) /* RSB overwrite needed for idle. 
*/ + XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ + XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */ + XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */ +diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h +index 6a77c39378..391973ef6a 100644 +--- a/xen/include/asm-x86/spec_ctrl.h ++++ b/xen/include/asm-x86/spec_ctrl.h +@@ -159,6 +159,21 @@ static always_inline void spec_ctrl_enter_idle(struct cpu_info *info) + */ + alternative_input("", "verw %[sel]", X86_FEATURE_SC_VERW_IDLE, + [sel] "m" (info->verw_sel)); ++ ++ /* ++ * Cross-Thread Return Address Predictions: ++ * ++ * On vulnerable systems, the return predictions (RSB/RAS) are statically ++ * partitioned between active threads. When entering idle, our entries ++ * are re-partitioned to allow the other threads to use them. ++ * ++ * In some cases, we might still have guest entries in the RAS, so flush ++ * them before injecting them sideways to our sibling thread. ++ * ++ * (ab)use alternative_input() to specify clobbers. ++ */ ++ alternative_input("", "DO_OVERWRITE_RSB", X86_FEATURE_SC_RSB_IDLE, ++ : "rax", "rcx"); + } + + /* WARNING! `ret`, `call *`, `jmp *` not safe before this call. */ +-- +2.40.0 + diff --git a/0023-automation-Remove-clang-8-from-Debian-unstable-conta.patch b/0023-automation-Remove-clang-8-from-Debian-unstable-conta.patch new file mode 100644 index 0000000..36dfb4f --- /dev/null +++ b/0023-automation-Remove-clang-8-from-Debian-unstable-conta.patch @@ -0,0 +1,84 @@ +From 0802504627453a54b1ab408b6e9dc8b5c561172d Mon Sep 17 00:00:00 2001 +From: Anthony PERARD <anthony.perard@citrix.com> +Date: Tue, 21 Feb 2023 16:55:38 +0000 +Subject: [PATCH 23/61] automation: Remove clang-8 from Debian unstable + container + +First, apt complain that it isn't the right way to add keys anymore, +but hopefully that's just a warning. 
+ +Second, we can't install clang-8: +The following packages have unmet dependencies: + clang-8 : Depends: libstdc++-8-dev but it is not installable + Depends: libgcc-8-dev but it is not installable + Depends: libobjc-8-dev but it is not installable + Recommends: llvm-8-dev but it is not going to be installed + Recommends: libomp-8-dev but it is not going to be installed + libllvm8 : Depends: libffi7 (>= 3.3~20180313) but it is not installable +E: Unable to correct problems, you have held broken packages. + +clang on Debian unstable is now version 14.0.6. + +Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +(cherry picked from commit a6b1e2b80fe2053b1c9c9843fb086a668513ea36) +--- + automation/build/debian/unstable-llvm-8.list | 3 --- + automation/build/debian/unstable.dockerfile | 12 ------------ + automation/gitlab-ci/build.yaml | 10 ---------- + 3 files changed, 25 deletions(-) + delete mode 100644 automation/build/debian/unstable-llvm-8.list + +diff --git a/automation/build/debian/unstable-llvm-8.list b/automation/build/debian/unstable-llvm-8.list +deleted file mode 100644 +index dc119fa0b4..0000000000 +--- a/automation/build/debian/unstable-llvm-8.list ++++ /dev/null +@@ -1,3 +0,0 @@ +-# Unstable LLVM 8 repos +-deb http://apt.llvm.org/unstable/ llvm-toolchain-8 main +-deb-src http://apt.llvm.org/unstable/ llvm-toolchain-8 main +diff --git a/automation/build/debian/unstable.dockerfile b/automation/build/debian/unstable.dockerfile +index bd61cd12c2..828afa2e1e 100644 +--- a/automation/build/debian/unstable.dockerfile ++++ b/automation/build/debian/unstable.dockerfile +@@ -52,15 +52,3 @@ RUN apt-get update && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* /tmp/* /var/tmp/* +- +-RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add - +-COPY unstable-llvm-8.list /etc/apt/sources.list.d/ +- +-RUN apt-get update && \ +- apt-get --quiet --yes install \ +- 
clang-8 \ +- lld-8 \ +- && \ +- apt-get autoremove -y && \ +- apt-get clean && \ +- rm -rf /var/lib/apt/lists* /tmp/* /var/tmp/* +diff --git a/automation/gitlab-ci/build.yaml b/automation/gitlab-ci/build.yaml +index fdd5c76582..06a75a8c5a 100644 +--- a/automation/gitlab-ci/build.yaml ++++ b/automation/gitlab-ci/build.yaml +@@ -304,16 +304,6 @@ debian-unstable-clang-debug: + variables: + CONTAINER: debian:unstable + +-debian-unstable-clang-8: +- extends: .clang-8-x86-64-build +- variables: +- CONTAINER: debian:unstable +- +-debian-unstable-clang-8-debug: +- extends: .clang-8-x86-64-build-debug +- variables: +- CONTAINER: debian:unstable +- + debian-unstable-gcc: + extends: .gcc-x86-64-build + variables: +-- +2.40.0 + diff --git a/0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch b/0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch deleted file mode 100644 index 724d1d8..0000000 --- a/0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 49510071ee93905378e54664778760ed3908d447 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 15:00:59 +0200 -Subject: [PATCH 23/87] tools/xenstore: minor fix of the migration stream doc - -Drop mentioning the non-existent read-only socket in the migration -stream description document. - -The related record field was removed in commit 8868a0e3f674 ("docs: -update the xenstore migration stream documentation). 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -master commit: ace1d2eff80d3d66c37ae765dae3e3cb5697e5a4 -master date: 2022-09-08 09:25:58 +0200 ---- - docs/designs/xenstore-migration.md | 8 +++----- - 1 file changed, 3 insertions(+), 5 deletions(-) - -diff --git a/docs/designs/xenstore-migration.md b/docs/designs/xenstore-migration.md -index 5f1155273ec3..78530bbb0ef4 100644 ---- a/docs/designs/xenstore-migration.md -+++ b/docs/designs/xenstore-migration.md -@@ -129,11 +129,9 @@ xenstored state that needs to be restored. - | `evtchn-fd` | The file descriptor used to communicate with | - | | the event channel driver | - --xenstored will resume in the original process context. Hence `rw-socket-fd` and --`ro-socket-fd` simply specify the file descriptors of the sockets. Sockets --are not always used, however, and so -1 will be used to denote an unused --socket. -- -+xenstored will resume in the original process context. Hence `rw-socket-fd` -+simply specifies the file descriptor of the socket. Sockets are not always -+used, however, and so -1 will be used to denote an unused socket. - - \pagebreak - --- -2.37.4 - diff --git a/0024-libs-util-Fix-parallel-build-between-flex-bison-and-.patch b/0024-libs-util-Fix-parallel-build-between-flex-bison-and-.patch new file mode 100644 index 0000000..6164878 --- /dev/null +++ b/0024-libs-util-Fix-parallel-build-between-flex-bison-and-.patch @@ -0,0 +1,50 @@ +From e4b5dff3d06421847761669a3676bef1f23e705a Mon Sep 17 00:00:00 2001 +From: Anthony PERARD <anthony.perard@citrix.com> +Date: Fri, 3 Mar 2023 08:06:23 +0100 +Subject: [PATCH 24/61] libs/util: Fix parallel build between flex/bison and CC + rules + +flex/bison generate two targets, and when those targets are +prerequisite of other rules they are considered independently by make. + +We can have a situation where the .c file is out-of-date but not the +.h, git checkout for example. 
In this case, if a rule only have the .h +file as prerequiste, make will procced and start to build the object. +In parallel, another target can have the .c file as prerequisite and +make will find out it need re-generating and do so, changing the .h at +the same time. This parallel task breaks the first one. + +To avoid this scenario, we put both the header and the source as +prerequisite for all object even if they only need the header. + +Reported-by: Andrew Cooper <Andrew.Cooper3@citrix.com> +Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: bf652a50fb3bb3b1b3d93db6fb79bc28f978fe75 +master date: 2023-02-09 18:26:17 +0000 +--- + tools/libs/util/Makefile | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/tools/libs/util/Makefile b/tools/libs/util/Makefile +index b739360be7..977849c056 100644 +--- a/tools/libs/util/Makefile ++++ b/tools/libs/util/Makefile +@@ -41,6 +41,14 @@ include $(XEN_ROOT)/tools/libs/libs.mk + + $(LIB_OBJS) $(PIC_OBJS): $(AUTOINCS) _paths.h + ++# Adding the .c conterparts of the headers generated by flex/bison as ++# prerequisite of all objects. ++# This is to tell make that if only the .c file is out-of-date but not the ++# header, it should still wait for the .c file to be rebuilt. ++# Otherwise, make doesn't considered "%.c %.h" as grouped targets, and will run ++# the flex/bison rules in parallel of CC rules which only need the header. 
++$(LIB_OBJS) $(PIC_OBJS): libxlu_cfg_l.c libxlu_cfg_y.c libxlu_disk_l.c ++ + %.c %.h:: %.y + @rm -f $*.[ch] + $(BISON) --output=$*.c $< +-- +2.40.0 + diff --git a/0024-xen-gnttab-fix-gnttab_acquire_resource.patch b/0024-xen-gnttab-fix-gnttab_acquire_resource.patch deleted file mode 100644 index 49c0b7a..0000000 --- a/0024-xen-gnttab-fix-gnttab_acquire_resource.patch +++ /dev/null @@ -1,69 +0,0 @@ -From b9560762392c01b3ee84148c07be8017cb42dbc9 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 11 Oct 2022 15:01:22 +0200 -Subject: [PATCH 24/87] xen/gnttab: fix gnttab_acquire_resource() - -Commit 9dc46386d89d ("gnttab: work around "may be used uninitialized" -warning") was wrong, as vaddrs can legitimately be NULL in case -XENMEM_resource_grant_table_id_status was specified for a grant table -v1. This would result in crashes in debug builds due to -ASSERT_UNREACHABLE() triggering. - -Check vaddrs only to be NULL in the rc == 0 case. - -Expand the tests in tools/tests/resource to tickle this path, and verify that -using XENMEM_resource_grant_table_id_status on a v1 grant table fails. 
- -Fixes: 9dc46386d89d ("gnttab: work around "may be used uninitialized" warning") -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> # xen -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 52daa6a8483e4fbd6757c9d1b791e23931791608 -master date: 2022-09-09 16:28:38 +0100 ---- - tools/tests/resource/test-resource.c | 15 +++++++++++++++ - xen/common/grant_table.c | 2 +- - 2 files changed, 16 insertions(+), 1 deletion(-) - -diff --git a/tools/tests/resource/test-resource.c b/tools/tests/resource/test-resource.c -index 0557f8a1b585..37dfff4dcd20 100644 ---- a/tools/tests/resource/test-resource.c -+++ b/tools/tests/resource/test-resource.c -@@ -106,6 +106,21 @@ static void test_gnttab(uint32_t domid, unsigned int nr_frames, - if ( rc ) - return fail(" Fail: Unmap grant table %d - %s\n", - errno, strerror(errno)); -+ -+ /* -+ * Verify that an attempt to map the status frames fails, as the domain is -+ * in gnttab v1 mode. -+ */ -+ res = xenforeignmemory_map_resource( -+ fh, domid, XENMEM_resource_grant_table, -+ XENMEM_resource_grant_table_id_status, 0, 1, -+ (void **)&gnttab, PROT_READ | PROT_WRITE, 0); -+ -+ if ( res ) -+ { -+ fail(" Fail: Managed to map gnttab v2 status frames in v1 mode\n"); -+ xenforeignmemory_unmap_resource(fh, res); -+ } - } - - static void test_domain_configurations(void) -diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c -index d8ca645b96ff..76272b3c8add 100644 ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -4142,7 +4142,7 @@ int gnttab_acquire_resource( - * on non-error paths, and hence it needs setting to NULL at the top of the - * function. Leave some runtime safety. 
- */ -- if ( !vaddrs ) -+ if ( !rc && !vaddrs ) - { - ASSERT_UNREACHABLE(); - rc = -ENODATA; --- -2.37.4 - diff --git a/0025-x86-cpuid-Infrastructure-for-leaves-7-1-ecx-edx.patch b/0025-x86-cpuid-Infrastructure-for-leaves-7-1-ecx-edx.patch new file mode 100644 index 0000000..e73f62d --- /dev/null +++ b/0025-x86-cpuid-Infrastructure-for-leaves-7-1-ecx-edx.patch @@ -0,0 +1,128 @@ +From 2094f834b85d32233c76763b014bc8764c3e36b1 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 3 Mar 2023 08:06:44 +0100 +Subject: [PATCH 25/61] x86/cpuid: Infrastructure for leaves 7:1{ecx,edx} + +We don't actually need ecx yet, but adding it in now will reduce the amount to +which leaf 7 is out of order in a featureset. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: b4a23bf6293aadecfd03bf9e83974443e2eac9cb +master date: 2023-02-09 18:26:17 +0000 +--- + tools/misc/xen-cpuid.c | 10 ++++++++++ + xen/arch/x86/cpu/common.c | 3 ++- + xen/include/public/arch-x86/cpufeatureset.h | 4 ++++ + xen/include/xen/lib/x86/cpuid.h | 17 +++++++++++++++-- + 4 files changed, 31 insertions(+), 3 deletions(-) + +diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c +index cd094427dd..3cfbbf043f 100644 +--- a/tools/misc/xen-cpuid.c ++++ b/tools/misc/xen-cpuid.c +@@ -198,6 +198,14 @@ static const char *const str_7b1[32] = + { + }; + ++static const char *const str_7c1[32] = ++{ ++}; ++ ++static const char *const str_7d1[32] = ++{ ++}; ++ + static const char *const str_7d2[32] = + { + [ 0] = "intel-psfd", +@@ -223,6 +231,8 @@ static const struct { + { "0x80000021.eax", "e21a", str_e21a }, + { "0x00000007:1.ebx", "7b1", str_7b1 }, + { "0x00000007:2.edx", "7d2", str_7d2 }, ++ { "0x00000007:1.ecx", "7c1", str_7c1 }, ++ { "0x00000007:1.edx", "7d1", str_7d1 }, + }; + + #define COL_ALIGN "18" +diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c +index 9ce148a666..8222de6461 100644 +--- 
a/xen/arch/x86/cpu/common.c ++++ b/xen/arch/x86/cpu/common.c +@@ -448,7 +448,8 @@ static void generic_identify(struct cpuinfo_x86 *c) + cpuid_count(7, 1, + &c->x86_capability[FEATURESET_7a1], + &c->x86_capability[FEATURESET_7b1], +- &tmp, &tmp); ++ &c->x86_capability[FEATURESET_7c1], ++ &c->x86_capability[FEATURESET_7d1]); + if (max_subleaf >= 2) + cpuid_count(7, 2, + &tmp, &tmp, &tmp, +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index e073122140..0b01ca5e8f 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -304,6 +304,10 @@ XEN_CPUFEATURE(NSCB, 11*32+ 6) /*A Null Selector Clears Base (and + /* Intel-defined CPU features, CPUID level 0x00000007:2.edx, word 13 */ + XEN_CPUFEATURE(INTEL_PSFD, 13*32+ 0) /*A MSR_SPEC_CTRL.PSFD */ + ++/* Intel-defined CPU features, CPUID level 0x00000007:1.ecx, word 14 */ ++ ++/* Intel-defined CPU features, CPUID level 0x00000007:1.edx, word 15 */ ++ + #endif /* XEN_CPUFEATURE */ + + /* Clean up from a default include. Close the enum (for C). */ +diff --git a/xen/include/xen/lib/x86/cpuid.h b/xen/include/xen/lib/x86/cpuid.h +index 50be07c0eb..fa98b371ee 100644 +--- a/xen/include/xen/lib/x86/cpuid.h ++++ b/xen/include/xen/lib/x86/cpuid.h +@@ -17,7 +17,9 @@ + #define FEATURESET_7a1 10 /* 0x00000007:1.eax */ + #define FEATURESET_e21a 11 /* 0x80000021.eax */ + #define FEATURESET_7b1 12 /* 0x00000007:1.ebx */ +-#define FEATURESET_7d2 13 /* 0x80000007:2.edx */ ++#define FEATURESET_7d2 13 /* 0x00000007:2.edx */ ++#define FEATURESET_7c1 14 /* 0x00000007:1.ecx */ ++#define FEATURESET_7d1 15 /* 0x00000007:1.edx */ + + struct cpuid_leaf + { +@@ -194,7 +196,14 @@ struct cpuid_policy + uint32_t _7b1; + struct { DECL_BITFIELD(7b1); }; + }; +- uint32_t /* c */:32, /* d */:32; ++ union { ++ uint32_t _7c1; ++ struct { DECL_BITFIELD(7c1); }; ++ }; ++ union { ++ uint32_t _7d1; ++ struct { DECL_BITFIELD(7d1); }; ++ }; + + /* Subleaf 2. 
*/ + uint32_t /* a */:32, /* b */:32, /* c */:32; +@@ -343,6 +352,8 @@ static inline void cpuid_policy_to_featureset( + fs[FEATURESET_e21a] = p->extd.e21a; + fs[FEATURESET_7b1] = p->feat._7b1; + fs[FEATURESET_7d2] = p->feat._7d2; ++ fs[FEATURESET_7c1] = p->feat._7c1; ++ fs[FEATURESET_7d1] = p->feat._7d1; + } + + /* Fill in a CPUID policy from a featureset bitmap. */ +@@ -363,6 +374,8 @@ static inline void cpuid_featureset_to_policy( + p->extd.e21a = fs[FEATURESET_e21a]; + p->feat._7b1 = fs[FEATURESET_7b1]; + p->feat._7d2 = fs[FEATURESET_7d2]; ++ p->feat._7c1 = fs[FEATURESET_7c1]; ++ p->feat._7d1 = fs[FEATURESET_7d1]; + } + + static inline uint64_t cpuid_policy_xcr0_max(const struct cpuid_policy *p) +-- +2.40.0 + diff --git a/0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch b/0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch deleted file mode 100644 index 489a9c8..0000000 --- a/0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch +++ /dev/null @@ -1,59 +0,0 @@ -From 3f4da85ca8816f6617529c80850eaddd80ea0f1f Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 11 Oct 2022 15:01:36 +0200 -Subject: [PATCH 25/87] x86: wire up VCPUOP_register_vcpu_time_memory_area for - 32-bit guests -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Forever sinced its introduction VCPUOP_register_vcpu_time_memory_area -was available only to native domains. Linux, for example, would attempt -to use it irrespective of guest bitness (including in its so called -PVHVM mode) as long as it finds XEN_PVCLOCK_TSC_STABLE_BIT set (which we -set only for clocksource=tsc, which in turn needs engaging via command -line option). 
- -Fixes: a5d39947cb89 ("Allow guests to register secondary vcpu_time_info") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: b726541d94bd0a80b5864d17a2cd2e6d73a3fe0a -master date: 2022-09-29 14:47:45 +0200 ---- - xen/arch/x86/x86_64/domain.c | 20 ++++++++++++++++++++ - 1 file changed, 20 insertions(+) - -diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c -index c46dccc25a54..d51d99344796 100644 ---- a/xen/arch/x86/x86_64/domain.c -+++ b/xen/arch/x86/x86_64/domain.c -@@ -54,6 +54,26 @@ arch_compat_vcpu_op( - break; - } - -+ case VCPUOP_register_vcpu_time_memory_area: -+ { -+ struct compat_vcpu_register_time_memory_area area = { .addr.p = 0 }; -+ -+ rc = -EFAULT; -+ if ( copy_from_guest(&area.addr.h, arg, 1) ) -+ break; -+ -+ if ( area.addr.h.c != area.addr.p || -+ !compat_handle_okay(area.addr.h, 1) ) -+ break; -+ -+ rc = 0; -+ guest_from_compat_handle(v->arch.time_info_guest, area.addr.h); -+ -+ force_update_vcpu_system_time(v); -+ -+ break; -+ } -+ - case VCPUOP_get_physid: - rc = arch_do_vcpu_op(cmd, v, arg); - break; --- -2.37.4 - diff --git a/0026-x86-shskt-Disable-CET-SS-on-parts-susceptible-to-fra.patch b/0026-x86-shskt-Disable-CET-SS-on-parts-susceptible-to-fra.patch new file mode 100644 index 0000000..7fd4031 --- /dev/null +++ b/0026-x86-shskt-Disable-CET-SS-on-parts-susceptible-to-fra.patch @@ -0,0 +1,191 @@ +From 5857cc632b884711c172c5766b8fbba59f990b47 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 3 Mar 2023 08:12:24 +0100 +Subject: [PATCH 26/61] x86/shskt: Disable CET-SS on parts susceptible to + fractured updates + +Refer to Intel SDM Rev 70 (Dec 2022), Vol3 17.2.3 "Supervisor Shadow Stack +Token". + +Architecturally, an event delivery which starts in CPL<3 and switches shadow +stack will first validate the Supervisor Shadow Stack Token (setting the busy +bit), then pushes CS/LIP/SSP. 
One example of this is an NMI interrupting Xen. + +Some CPUs suffer from an issue called fracturing, whereby a fault/vmexit/etc +between setting the busy bit and completing the event injection renders the +action non-restartable, because when it comes time to restart, the busy bit is +found to be already set. + +This is far more easily encountered under virt, yet it is not the fault of the +hypervisor, nor the fault of the guest kernel. The fault lies somewhere +between the architectural specification, and the uarch behaviour. + +Intel have allocated CPUID.7[1].ecx[18] CET_SSS to enumerate that supervisor +shadow stacks are safe to use. Because of how Xen lays out its shadow stacks, +fracturing is not expected to be a problem on native. + +Detect this case on boot and default to not using shstk if virtualised. +Specifying `cet=shstk` on the command line will override this heuristic and +enable shadow stacks irrespective. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 01e7477d1b081cff4288ff9f51ec59ee94c03ee0 +master date: 2023-02-09 18:26:17 +0000 +--- + docs/misc/xen-command-line.pandoc | 7 +++- + tools/libs/light/libxl_cpuid.c | 2 + + tools/misc/xen-cpuid.c | 1 + + xen/arch/x86/cpu/common.c | 8 +++- + xen/arch/x86/setup.c | 46 +++++++++++++++++---- + xen/include/public/arch-x86/cpufeatureset.h | 1 + + 6 files changed, 55 insertions(+), 10 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index b3f60cd923..a6018fd5c3 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -287,10 +287,15 @@ can be maintained with the pv-shim mechanism. + protection. + + The option is available when `CONFIG_XEN_SHSTK` is compiled in, and +- defaults to `true` on hardware supporting CET-SS. Specifying ++ generally defaults to `true` on hardware supporting CET-SS. 
Specifying + `cet=no-shstk` will cause Xen not to use Shadow Stacks even when support + is available in hardware. + ++ Some hardware suffers from an issue known as Supervisor Shadow Stack ++ Fracturing. On such hardware, Xen will default to not using Shadow Stacks ++ when virtualised. Specifying `cet=shstk` will override this heuristic and ++ enable Shadow Stacks unilaterally. ++ + * The `ibt=` boolean controls whether Xen uses Indirect Branch Tracking for + its own protection. + +diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c +index 691d5c6b2a..b4eacc2bd5 100644 +--- a/tools/libs/light/libxl_cpuid.c ++++ b/tools/libs/light/libxl_cpuid.c +@@ -234,6 +234,8 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) + {"fsrs", 0x00000007, 1, CPUID_REG_EAX, 11, 1}, + {"fsrcs", 0x00000007, 1, CPUID_REG_EAX, 12, 1}, + ++ {"cet-sss", 0x00000007, 1, CPUID_REG_EDX, 18, 1}, ++ + {"intel-psfd", 0x00000007, 2, CPUID_REG_EDX, 0, 1}, + + {"lahfsahf", 0x80000001, NA, CPUID_REG_ECX, 0, 1}, +diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c +index 3cfbbf043f..db9c4ed8fc 100644 +--- a/tools/misc/xen-cpuid.c ++++ b/tools/misc/xen-cpuid.c +@@ -204,6 +204,7 @@ static const char *const str_7c1[32] = + + static const char *const str_7d1[32] = + { ++ [18] = "cet-sss", + }; + + static const char *const str_7d2[32] = +diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c +index 8222de6461..e1fc034ce6 100644 +--- a/xen/arch/x86/cpu/common.c ++++ b/xen/arch/x86/cpu/common.c +@@ -344,9 +344,15 @@ void __init early_cpu_init(void) + c->x86_model, c->x86_model, c->x86_mask, eax); + + if (c->cpuid_level >= 7) { +- cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); ++ uint32_t max_subleaf; ++ ++ cpuid_count(7, 0, &max_subleaf, &ebx, &ecx, &edx); + c->x86_capability[cpufeat_word(X86_FEATURE_CET_SS)] = ecx; + c->x86_capability[cpufeat_word(X86_FEATURE_CET_IBT)] = edx; ++ ++ if (max_subleaf >= 1) ++ cpuid_count(7, 1, &eax, &ebx, &ecx, 
++ &c->x86_capability[FEATURESET_7d1]); + } + + eax = cpuid_eax(0x80000000); +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index 70b37d8afe..f0de805780 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -98,11 +98,7 @@ unsigned long __initdata highmem_start; + size_param("highmem-start", highmem_start); + #endif + +-#ifdef CONFIG_XEN_SHSTK +-static bool __initdata opt_xen_shstk = true; +-#else +-#define opt_xen_shstk false +-#endif ++static int8_t __initdata opt_xen_shstk = -IS_ENABLED(CONFIG_XEN_SHSTK); + + #ifdef CONFIG_XEN_IBT + static bool __initdata opt_xen_ibt = true; +@@ -1113,11 +1109,45 @@ void __init noreturn __start_xen(unsigned long mbi_p) + early_cpu_init(); + + /* Choose shadow stack early, to set infrastructure up appropriately. */ +- if ( opt_xen_shstk && boot_cpu_has(X86_FEATURE_CET_SS) ) ++ if ( !boot_cpu_has(X86_FEATURE_CET_SS) ) ++ opt_xen_shstk = 0; ++ ++ if ( opt_xen_shstk ) + { +- printk("Enabling Supervisor Shadow Stacks\n"); ++ /* ++ * Some CPUs suffer from Shadow Stack Fracturing, an issue whereby a ++ * fault/VMExit/etc between setting a Supervisor Busy bit and the ++ * event delivery completing renders the operation non-restartable. ++ * On restart, event delivery will find the Busy bit already set. ++ * ++ * This is a problem on bare metal, but outside of synthetic cases or ++ * a very badly timed #MC, it's not believed to be a problem. It is a ++ * much bigger problem under virt, because we can VMExit for a number ++ * of legitimate reasons and tickle this bug. ++ * ++ * CPUs with this addressed enumerate CET-SSS to indicate that ++ * supervisor shadow stacks are now safe to use. ++ */ ++ bool cpu_has_bug_shstk_fracture = ++ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && ++ !boot_cpu_has(X86_FEATURE_CET_SSS); + +- setup_force_cpu_cap(X86_FEATURE_XEN_SHSTK); ++ /* ++ * On bare metal, assume that Xen won't be impacted by shstk ++ * fracturing problems. 
Under virt, be more conservative and disable ++ * shstk by default. ++ */ ++ if ( opt_xen_shstk == -1 ) ++ opt_xen_shstk = ++ cpu_has_hypervisor ? !cpu_has_bug_shstk_fracture ++ : true; ++ ++ if ( opt_xen_shstk ) ++ { ++ printk("Enabling Supervisor Shadow Stacks\n"); ++ ++ setup_force_cpu_cap(X86_FEATURE_XEN_SHSTK); ++ } + } + + if ( opt_xen_ibt && boot_cpu_has(X86_FEATURE_CET_IBT) ) +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 0b01ca5e8f..4832ad09df 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -307,6 +307,7 @@ XEN_CPUFEATURE(INTEL_PSFD, 13*32+ 0) /*A MSR_SPEC_CTRL.PSFD */ + /* Intel-defined CPU features, CPUID level 0x00000007:1.ecx, word 14 */ + + /* Intel-defined CPU features, CPUID level 0x00000007:1.edx, word 15 */ ++XEN_CPUFEATURE(CET_SSS, 15*32+18) /* CET Supervisor Shadow Stacks safe to use */ + + #endif /* XEN_CPUFEATURE */ + +-- +2.40.0 + diff --git a/0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch b/0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch deleted file mode 100644 index 910f573..0000000 --- a/0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch +++ /dev/null @@ -1,97 +0,0 @@ -From 1bce7fb1f702da4f7a749c6f1457ecb20bf74fca Mon Sep 17 00:00:00 2001 -From: Tamas K Lengyel <tamas.lengyel@intel.com> -Date: Tue, 11 Oct 2022 15:01:48 +0200 -Subject: [PATCH 26/87] x86/vpmu: Fix race-condition in vpmu_load - -The vPMU code-bases attempts to perform an optimization on saving/reloading the -PMU context by keeping track of what vCPU ran on each pCPU. When a pCPU is -getting scheduled, checks if the previous vCPU isn't the current one. If so, -attempts a call to vpmu_save_force. Unfortunately if the previous vCPU is -already getting scheduled to run on another pCPU its state will be already -runnable, which results in an ASSERT failure. 
- -Fix this by always performing a pmu context save in vpmu_save when called from -vpmu_switch_from, and do a vpmu_load when called from vpmu_switch_to. - -While this presents a minimal overhead in case the same vCPU is getting -rescheduled on the same pCPU, the ASSERT failure is avoided and the code is a -lot easier to reason about. - -Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: defa4e51d20a143bdd4395a075bf0933bb38a9a4 -master date: 2022-09-30 09:53:49 +0200 ---- - xen/arch/x86/cpu/vpmu.c | 42 ++++------------------------------------- - 1 file changed, 4 insertions(+), 38 deletions(-) - -diff --git a/xen/arch/x86/cpu/vpmu.c b/xen/arch/x86/cpu/vpmu.c -index 16e91a3694fe..b6c2ec3cd047 100644 ---- a/xen/arch/x86/cpu/vpmu.c -+++ b/xen/arch/x86/cpu/vpmu.c -@@ -368,58 +368,24 @@ void vpmu_save(struct vcpu *v) - vpmu->last_pcpu = pcpu; - per_cpu(last_vcpu, pcpu) = v; - -+ vpmu_set(vpmu, VPMU_CONTEXT_SAVE); -+ - if ( vpmu->arch_vpmu_ops ) - if ( vpmu->arch_vpmu_ops->arch_vpmu_save(v, 0) ) - vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); - -+ vpmu_reset(vpmu, VPMU_CONTEXT_SAVE); -+ - apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED); - } - - int vpmu_load(struct vcpu *v, bool_t from_guest) - { - struct vpmu_struct *vpmu = vcpu_vpmu(v); -- int pcpu = smp_processor_id(); -- struct vcpu *prev = NULL; - - if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) - return 0; - -- /* First time this VCPU is running here */ -- if ( vpmu->last_pcpu != pcpu ) -- { -- /* -- * Get the context from last pcpu that we ran on. Note that if another -- * VCPU is running there it must have saved this VPCU's context before -- * startig to run (see below). -- * There should be no race since remote pcpu will disable interrupts -- * before saving the context. 
-- */ -- if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) -- { -- on_selected_cpus(cpumask_of(vpmu->last_pcpu), -- vpmu_save_force, (void *)v, 1); -- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); -- } -- } -- -- /* Prevent forced context save from remote CPU */ -- local_irq_disable(); -- -- prev = per_cpu(last_vcpu, pcpu); -- -- if ( prev != v && prev ) -- { -- vpmu = vcpu_vpmu(prev); -- -- /* Someone ran here before us */ -- vpmu_save_force(prev); -- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); -- -- vpmu = vcpu_vpmu(v); -- } -- -- local_irq_enable(); -- - /* Only when PMU is counting, we load PMU context immediately. */ - if ( !vpmu_is_set(vpmu, VPMU_RUNNING) || - (!has_vlapic(vpmu_vcpu(vpmu)->domain) && --- -2.37.4 - diff --git a/0027-arm-p2m-Rework-p2m_init.patch b/0027-arm-p2m-Rework-p2m_init.patch deleted file mode 100644 index 0668899..0000000 --- a/0027-arm-p2m-Rework-p2m_init.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 86cb37447548420e41ff953a7372972f6154d6d1 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 25 Oct 2022 09:21:11 +0000 -Subject: [PATCH 27/87] arm/p2m: Rework p2m_init() - -p2m_init() is mostly trivial initialisation, but has two fallible operations -which are on either side of the backpointer trigger for teardown to take -actions. - -p2m_free_vmid() is idempotent with a failed p2m_alloc_vmid(), so rearrange -p2m_init() to perform all trivial setup, then set the backpointer, then -perform all fallible setup. - -This will simplify a future bugfix which needs to add a third fallible -operation. - -No practical change. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Bertrand Marquis <bertrand.marquis@arm.com> -(cherry picked from commit: 3783e583319fa1ce75e414d851f0fde191a14753) ---- - xen/arch/arm/p2m.c | 24 ++++++++++++------------ - 1 file changed, 12 insertions(+), 12 deletions(-) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index b2d856a801af..4f7d923ad9f8 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1730,7 +1730,7 @@ void p2m_final_teardown(struct domain *d) - int p2m_init(struct domain *d) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); -- int rc = 0; -+ int rc; - unsigned int cpu; - - rwlock_init(&p2m->lock); -@@ -1739,11 +1739,6 @@ int p2m_init(struct domain *d) - INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist); - - p2m->vmid = INVALID_VMID; -- -- rc = p2m_alloc_vmid(d); -- if ( rc != 0 ) -- return rc; -- - p2m->max_mapped_gfn = _gfn(0); - p2m->lowest_mapped_gfn = _gfn(ULONG_MAX); - -@@ -1759,8 +1754,6 @@ int p2m_init(struct domain *d) - p2m->clean_pte = is_iommu_enabled(d) && - !iommu_has_feature(d, IOMMU_FEAT_COHERENT_WALK); - -- rc = p2m_alloc_table(d); -- - /* - * Make sure that the type chosen to is able to store the an vCPU ID - * between 0 and the maximum of virtual CPUS supported as long as -@@ -1773,13 +1766,20 @@ int p2m_init(struct domain *d) - p2m->last_vcpu_ran[cpu] = INVALID_VCPU_ID; - - /* -- * Besides getting a domain when we only have the p2m in hand, -- * the back pointer to domain is also used in p2m_teardown() -- * as an end-of-initialization indicator. -+ * "Trivial" initialisation is now complete. Set the backpointer so -+ * p2m_teardown() and friends know to do something. 
- */ - p2m->domain = d; - -- return rc; -+ rc = p2m_alloc_vmid(d); -+ if ( rc ) -+ return rc; -+ -+ rc = p2m_alloc_table(d); -+ if ( rc ) -+ return rc; -+ -+ return 0; - } - - /* --- -2.37.4 - diff --git a/0027-credit2-respect-credit2_runqueue-all-when-arranging-.patch b/0027-credit2-respect-credit2_runqueue-all-when-arranging-.patch new file mode 100644 index 0000000..6c8ab5c --- /dev/null +++ b/0027-credit2-respect-credit2_runqueue-all-when-arranging-.patch @@ -0,0 +1,69 @@ +From 366693226ce025e8721626609b4b43b9061b55f5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + <marmarek@invisiblethingslab.com> +Date: Fri, 3 Mar 2023 08:13:20 +0100 +Subject: [PATCH 27/61] credit2: respect credit2_runqueue=all when arranging + runqueues +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Documentation for credit2_runqueue=all says it should create one queue +for all pCPUs on the host. But since introduction +sched_credit2_max_cpus_runqueue, it actually created separate runqueue +per socket, even if the CPUs count is below +sched_credit2_max_cpus_runqueue. + +Adjust the condition to skip syblink check in case of +credit2_runqueue=all. 
+ +Fixes: 8e2aa76dc167 ("xen: credit2: limit the max number of CPUs in a runqueue") +Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> +Reviewed-by: Juergen Gross <jgross@suse.com> +master commit: 1f5747ee929fbbcae58d7234c6c38a77495d0cfe +master date: 2023-02-15 16:12:42 +0100 +--- + docs/misc/xen-command-line.pandoc | 5 +++++ + xen/common/sched/credit2.c | 9 +++++++-- + 2 files changed, 12 insertions(+), 2 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index a6018fd5c3..7b7a619c1b 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -724,6 +724,11 @@ Available alternatives, with their meaning, are: + * `all`: just one runqueue shared by all the logical pCPUs of + the host + ++Regardless of the above choice, Xen attempts to respect ++`sched_credit2_max_cpus_runqueue` limit, which may mean more than one runqueue ++for the `all` value. If that isn't intended, raise ++the `sched_credit2_max_cpus_runqueue` value. ++ + ### dbgp + > `= ehci[ <integer> | @pci<bus>:<slot>.<func> ]` + +diff --git a/xen/common/sched/credit2.c b/xen/common/sched/credit2.c +index 6396b38e04..1a240f417a 100644 +--- a/xen/common/sched/credit2.c ++++ b/xen/common/sched/credit2.c +@@ -996,9 +996,14 @@ cpu_add_to_runqueue(const struct scheduler *ops, unsigned int cpu) + * + * Otherwise, let's try to make sure that siblings stay in the + * same runqueue, pretty much under any cinrcumnstances. ++ * ++ * Furthermore, try to respect credit2_runqueue=all, as long as ++ * max_cpus_runq isn't violated. 
+ */ +- if ( rqd->refcnt < max_cpus_runq && (ops->cpupool->gran != SCHED_GRAN_cpu || +- cpu_runqueue_siblings_match(rqd, cpu, max_cpus_runq)) ) ++ if ( rqd->refcnt < max_cpus_runq && ++ (ops->cpupool->gran != SCHED_GRAN_cpu || ++ cpu_runqueue_siblings_match(rqd, cpu, max_cpus_runq) || ++ opt_runqueue == OPT_RUNQUEUE_ALL) ) + { + /* + * This runqueue is ok, but as we said, we also want an even +-- +2.40.0 + diff --git a/0028-x86-ucode-AMD-apply-the-patch-early-on-every-logical.patch b/0028-x86-ucode-AMD-apply-the-patch-early-on-every-logical.patch new file mode 100644 index 0000000..55df5d0 --- /dev/null +++ b/0028-x86-ucode-AMD-apply-the-patch-early-on-every-logical.patch @@ -0,0 +1,152 @@ +From d1c6934b41f8288ea3169e63bce8a7eea9d9c549 Mon Sep 17 00:00:00 2001 +From: Sergey Dyasli <sergey.dyasli@citrix.com> +Date: Fri, 3 Mar 2023 08:14:01 +0100 +Subject: [PATCH 28/61] x86/ucode/AMD: apply the patch early on every logical + thread + +The original issue has been reported on AMD Bulldozer-based CPUs where +ucode loading loses the LWP feature bit in order to gain the IBPB bit. +LWP disabling is per-SMT/CMT core modification and needs to happen on +each sibling thread despite the shared microcode engine. Otherwise, +logical CPUs will end up with different cpuid capabilities. +Link: https://bugzilla.kernel.org/show_bug.cgi?id=216211 + +Guests running under Xen happen to be not affected because of levelling +logic for the feature masking/override MSRs which causes the LWP bit to +fall out and hides the issue. The latest recommendation from AMD, after +discussing this bug, is to load ucode on every logical CPU. + +In Linux kernel this issue has been addressed by e7ad18d1169c +("x86/microcode/AMD: Apply the patch early on every logical thread"). +Follow the same approach in Xen. + +Introduce SAME_UCODE match result and use it for early AMD ucode +loading. 
Take this opportunity and move opt_ucode_allow_same out of +compare_revisions() to the relevant callers and also modify the warning +message based on it. Intel's side of things is modified for consistency +but provides no functional change. + +Signed-off-by: Sergey Dyasli <sergey.dyasli@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: f4ef8a41b80831db2136bdaff9f946a1a4b051e7 +master date: 2023-02-21 15:08:05 +0100 +--- + xen/arch/x86/cpu/microcode/amd.c | 11 ++++++++--- + xen/arch/x86/cpu/microcode/core.c | 24 ++++++++++++++++-------- + xen/arch/x86/cpu/microcode/intel.c | 10 +++++++--- + xen/arch/x86/cpu/microcode/private.h | 3 ++- + 4 files changed, 33 insertions(+), 15 deletions(-) + +diff --git a/xen/arch/x86/cpu/microcode/amd.c b/xen/arch/x86/cpu/microcode/amd.c +index fe92e594f1..52182c1a23 100644 +--- a/xen/arch/x86/cpu/microcode/amd.c ++++ b/xen/arch/x86/cpu/microcode/amd.c +@@ -176,8 +176,8 @@ static enum microcode_match_result compare_revisions( + if ( new_rev > old_rev ) + return NEW_UCODE; + +- if ( opt_ucode_allow_same && new_rev == old_rev ) +- return NEW_UCODE; ++ if ( new_rev == old_rev ) ++ return SAME_UCODE; + + return OLD_UCODE; + } +@@ -220,8 +220,13 @@ static int apply_microcode(const struct microcode_patch *patch) + unsigned int cpu = smp_processor_id(); + struct cpu_signature *sig = &per_cpu(cpu_sig, cpu); + uint32_t rev, old_rev = sig->rev; ++ enum microcode_match_result result = microcode_fits(patch); + +- if ( microcode_fits(patch) != NEW_UCODE ) ++ /* ++ * Allow application of the same revision to pick up SMT-specific changes ++ * even if the revision of the other SMT thread is already up-to-date. 
++ */ ++ if ( result != NEW_UCODE && result != SAME_UCODE ) + return -EINVAL; + + if ( check_final_patch_levels(sig) ) +diff --git a/xen/arch/x86/cpu/microcode/core.c b/xen/arch/x86/cpu/microcode/core.c +index ac3ceb567c..ceec1f1edc 100644 +--- a/xen/arch/x86/cpu/microcode/core.c ++++ b/xen/arch/x86/cpu/microcode/core.c +@@ -608,16 +608,24 @@ static long microcode_update_helper(void *data) + * that ucode revision. + */ + spin_lock(&microcode_mutex); +- if ( microcode_cache && +- microcode_ops->compare_patch(patch, microcode_cache) != NEW_UCODE ) ++ if ( microcode_cache ) + { +- spin_unlock(&microcode_mutex); +- printk(XENLOG_WARNING "microcode: couldn't find any newer revision " +- "in the provided blob!\n"); +- microcode_free_patch(patch); +- ret = -ENOENT; ++ enum microcode_match_result result; + +- goto put; ++ result = microcode_ops->compare_patch(patch, microcode_cache); ++ ++ if ( result != NEW_UCODE && ++ !(opt_ucode_allow_same && result == SAME_UCODE) ) ++ { ++ spin_unlock(&microcode_mutex); ++ printk(XENLOG_WARNING ++ "microcode: couldn't find any newer%s revision in the provided blob!\n", ++ opt_ucode_allow_same ? 
" (or the same)" : ""); ++ microcode_free_patch(patch); ++ ret = -ENOENT; ++ ++ goto put; ++ } + } + spin_unlock(&microcode_mutex); + +diff --git a/xen/arch/x86/cpu/microcode/intel.c b/xen/arch/x86/cpu/microcode/intel.c +index f6d01490e0..c26fbb8cc7 100644 +--- a/xen/arch/x86/cpu/microcode/intel.c ++++ b/xen/arch/x86/cpu/microcode/intel.c +@@ -232,8 +232,8 @@ static enum microcode_match_result compare_revisions( + if ( new_rev > old_rev ) + return NEW_UCODE; + +- if ( opt_ucode_allow_same && new_rev == old_rev ) +- return NEW_UCODE; ++ if ( new_rev == old_rev ) ++ return SAME_UCODE; + + /* + * Treat pre-production as always applicable - anyone using pre-production +@@ -290,8 +290,12 @@ static int apply_microcode(const struct microcode_patch *patch) + unsigned int cpu = smp_processor_id(); + struct cpu_signature *sig = &this_cpu(cpu_sig); + uint32_t rev, old_rev = sig->rev; ++ enum microcode_match_result result; ++ ++ result = microcode_update_match(patch); + +- if ( microcode_update_match(patch) != NEW_UCODE ) ++ if ( result != NEW_UCODE && ++ !(opt_ucode_allow_same && result == SAME_UCODE) ) + return -EINVAL; + + wbinvd(); +diff --git a/xen/arch/x86/cpu/microcode/private.h b/xen/arch/x86/cpu/microcode/private.h +index c085a10268..feafab0677 100644 +--- a/xen/arch/x86/cpu/microcode/private.h ++++ b/xen/arch/x86/cpu/microcode/private.h +@@ -6,7 +6,8 @@ + extern bool opt_ucode_allow_same; + + enum microcode_match_result { +- OLD_UCODE, /* signature matched, but revision id is older or equal */ ++ OLD_UCODE, /* signature matched, but revision id is older */ ++ SAME_UCODE, /* signature matched, but revision id is the same */ + NEW_UCODE, /* signature matched, but revision id is newer */ + MIS_UCODE, /* signature mismatched */ + }; +-- +2.40.0 + diff --git a/0028-xen-arm-p2m-Populate-pages-for-GICv2-mapping-in-p2m_.patch b/0028-xen-arm-p2m-Populate-pages-for-GICv2-mapping-in-p2m_.patch deleted file mode 100644 index 7bc6c36..0000000 --- 
a/0028-xen-arm-p2m-Populate-pages-for-GICv2-mapping-in-p2m_.patch +++ /dev/null @@ -1,169 +0,0 @@ -From e5a5bdeba6a0c3eacd2ba39c1ee36b3c54e77dca Mon Sep 17 00:00:00 2001 -From: Henry Wang <Henry.Wang@arm.com> -Date: Tue, 25 Oct 2022 09:21:12 +0000 -Subject: [PATCH 28/87] xen/arm: p2m: Populate pages for GICv2 mapping in - p2m_init() - -Hardware using GICv2 needs to create a P2M mapping of 8KB GICv2 area -when the domain is created. Considering the worst case of page tables -which requires 6 P2M pages as the two pages will be consecutive but not -necessarily in the same L3 page table and keep a buffer, populate 16 -pages as the default value to the P2M pages pool in p2m_init() at the -domain creation stage to satisfy the GICv2 requirement. For GICv3, the -above-mentioned P2M mapping is not necessary, but since the allocated -16 pages here would not be lost, hence populate these pages -unconditionally. - -With the default 16 P2M pages populated, there would be a case that -failures would happen in the domain creation with P2M pages already in -use. To properly free the P2M for this case, firstly support the -optionally preemption of p2m_teardown(), then call p2m_teardown() and -p2m_set_allocation(d, 0, NULL) non-preemptively in p2m_final_teardown(). -As non-preemptive p2m_teardown() should only return 0, use a -BUG_ON to confirm that. - -Since p2m_final_teardown() is called either after -domain_relinquish_resources() where relinquish_p2m_mapping() has been -called, or from failure path of domain_create()/arch_domain_create() -where mappings that require p2m_put_l3_page() should never be created, -relinquish_p2m_mapping() is not added in p2m_final_teardown(), add -in-code comments to refer this. 
- -Fixes: cbea5a1149ca ("xen/arm: Allocate and free P2M pages from the P2M pool") -Suggested-by: Julien Grall <jgrall@amazon.com> -Signed-off-by: Henry Wang <Henry.Wang@arm.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Bertrand Marquis <bertrand.marquis@arm.com> -(cherry picked from commit: c7cff1188802646eaa38e918e5738da0e84949be) ---- - xen/arch/arm/domain.c | 2 +- - xen/arch/arm/p2m.c | 34 ++++++++++++++++++++++++++++++++-- - xen/include/asm-arm/p2m.h | 14 ++++++++++---- - 3 files changed, 43 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c -index a818f33a1afa..c7feaa323ad1 100644 ---- a/xen/arch/arm/domain.c -+++ b/xen/arch/arm/domain.c -@@ -1059,7 +1059,7 @@ int domain_relinquish_resources(struct domain *d) - return ret; - - PROGRESS(p2m): -- ret = p2m_teardown(d); -+ ret = p2m_teardown(d, true); - if ( ret ) - return ret; - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 4f7d923ad9f8..6f87e17c1d08 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1661,7 +1661,7 @@ static void p2m_free_vmid(struct domain *d) - spin_unlock(&vmid_alloc_lock); - } - --int p2m_teardown(struct domain *d) -+int p2m_teardown(struct domain *d, bool allow_preemption) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); - unsigned long count = 0; -@@ -1669,6 +1669,9 @@ int p2m_teardown(struct domain *d) - unsigned int i; - int rc = 0; - -+ if ( page_list_empty(&p2m->pages) ) -+ return 0; -+ - p2m_write_lock(p2m); - - /* -@@ -1692,7 +1695,7 @@ int p2m_teardown(struct domain *d) - p2m_free_page(p2m->domain, pg); - count++; - /* Arbitrarily preempt every 512 iterations */ -- if ( !(count % 512) && hypercall_preempt_check() ) -+ if ( allow_preemption && !(count % 512) && hypercall_preempt_check() ) - { - rc = -ERESTART; - break; -@@ -1712,7 +1715,20 @@ void p2m_final_teardown(struct domain *d) - if ( !p2m->domain ) - return; - -+ /* -+ * No need to call relinquish_p2m_mapping() here because -+ * 
p2m_final_teardown() is called either after domain_relinquish_resources() -+ * where relinquish_p2m_mapping() has been called, or from failure path of -+ * domain_create()/arch_domain_create() where mappings that require -+ * p2m_put_l3_page() should never be created. For the latter case, also see -+ * comment on top of the p2m_set_entry() for more info. -+ */ -+ -+ BUG_ON(p2m_teardown(d, false)); - ASSERT(page_list_empty(&p2m->pages)); -+ -+ while ( p2m_teardown_allocation(d) == -ERESTART ) -+ continue; /* No preemption support here */ - ASSERT(page_list_empty(&d->arch.paging.p2m_freelist)); - - if ( p2m->root ) -@@ -1779,6 +1795,20 @@ int p2m_init(struct domain *d) - if ( rc ) - return rc; - -+ /* -+ * Hardware using GICv2 needs to create a P2M mapping of 8KB GICv2 area -+ * when the domain is created. Considering the worst case for page -+ * tables and keep a buffer, populate 16 pages to the P2M pages pool here. -+ * For GICv3, the above-mentioned P2M mapping is not necessary, but since -+ * the allocated 16 pages here would not be lost, hence populate these -+ * pages unconditionally. -+ */ -+ spin_lock(&d->arch.paging.lock); -+ rc = p2m_set_allocation(d, 16, NULL); -+ spin_unlock(&d->arch.paging.lock); -+ if ( rc ) -+ return rc; -+ - return 0; - } - -diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h -index c9598740bd02..b2725206e8de 100644 ---- a/xen/include/asm-arm/p2m.h -+++ b/xen/include/asm-arm/p2m.h -@@ -194,14 +194,18 @@ int p2m_init(struct domain *d); - - /* - * The P2M resources are freed in two parts: -- * - p2m_teardown() will be called when relinquish the resources. It -- * will free large resources (e.g. intermediate page-tables) that -- * requires preemption. -+ * - p2m_teardown() will be called preemptively when relinquish the -+ * resources, in which case it will free large resources (e.g. intermediate -+ * page-tables) that requires preemption. - * - p2m_final_teardown() will be called when domain struct is been - * freed. 
This *cannot* be preempted and therefore one small - * resources should be freed here. -+ * Note that p2m_final_teardown() will also call p2m_teardown(), to properly -+ * free the P2M when failures happen in the domain creation with P2M pages -+ * already in use. In this case p2m_teardown() is called non-preemptively and -+ * p2m_teardown() will always return 0. - */ --int p2m_teardown(struct domain *d); -+int p2m_teardown(struct domain *d, bool allow_preemption); - void p2m_final_teardown(struct domain *d); - - /* -@@ -266,6 +270,8 @@ mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn, - /* - * Direct set a p2m entry: only for use by the P2M code. - * The P2M write lock should be taken. -+ * TODO: Add a check in __p2m_set_entry() to avoid creating a mapping in -+ * arch_domain_create() that requires p2m_put_l3_page() to be called. - */ - int p2m_set_entry(struct p2m_domain *p2m, - gfn_t sgfn, --- -2.37.4 - diff --git a/0029-x86-perform-mem_sharing-teardown-before-paging-teard.patch b/0029-x86-perform-mem_sharing-teardown-before-paging-teard.patch new file mode 100644 index 0000000..c96f44e --- /dev/null +++ b/0029-x86-perform-mem_sharing-teardown-before-paging-teard.patch @@ -0,0 +1,111 @@ +From 700320a79297fb5087f7dd540424c468b2d2cffe Mon Sep 17 00:00:00 2001 +From: Tamas K Lengyel <tamas@tklengyel.com> +Date: Fri, 3 Mar 2023 08:14:25 +0100 +Subject: [PATCH 29/61] x86: perform mem_sharing teardown before paging + teardown + +An assert failure has been observed in p2m_teardown when performing vm +forking and then destroying the forked VM (p2m-basic.c:173). The assert +checks whether the domain's shared pages counter is 0. According to the +patch that originally added the assert (7bedbbb5c31) the p2m_teardown +should only happen after mem_sharing already relinquished all shared pages. + +In this patch we flip the order in which relinquish ops are called to avoid +tripping the assert. 
Conceptually sharing being torn down makes sense to +happen before paging is torn down. + +Fixes: e7aa55c0aab3 ("x86/p2m: free the paging memory pool preemptively") +Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 2869349f0cb3a89dcbf1f1b30371f58df6309312 +master date: 2023-02-23 12:35:48 +0100 +--- + xen/arch/x86/domain.c | 56 ++++++++++++++++++++++--------------------- + 1 file changed, 29 insertions(+), 27 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 3080cde62b..6eeb248908 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -2343,9 +2343,9 @@ int domain_relinquish_resources(struct domain *d) + + enum { + PROG_iommu_pagetables = 1, ++ PROG_shared, + PROG_paging, + PROG_vcpu_pagetables, +- PROG_shared, + PROG_xen, + PROG_l4, + PROG_l3, +@@ -2364,6 +2364,34 @@ int domain_relinquish_resources(struct domain *d) + if ( ret ) + return ret; + ++#ifdef CONFIG_MEM_SHARING ++ PROGRESS(shared): ++ ++ if ( is_hvm_domain(d) ) ++ { ++ /* ++ * If the domain has shared pages, relinquish them allowing ++ * for preemption. ++ */ ++ ret = relinquish_shared_pages(d); ++ if ( ret ) ++ return ret; ++ ++ /* ++ * If the domain is forked, decrement the parent's pause count ++ * and release the domain. ++ */ ++ if ( mem_sharing_is_fork(d) ) ++ { ++ struct domain *parent = d->parent; ++ ++ d->parent = NULL; ++ domain_unpause(parent); ++ put_domain(parent); ++ } ++ } ++#endif ++ + PROGRESS(paging): + + /* Tear down paging-assistance stuff. */ +@@ -2404,32 +2432,6 @@ int domain_relinquish_resources(struct domain *d) + d->arch.auto_unmask = 0; + } + +-#ifdef CONFIG_MEM_SHARING +- PROGRESS(shared): +- +- if ( is_hvm_domain(d) ) +- { +- /* If the domain has shared pages, relinquish them allowing +- * for preemption. 
*/ +- ret = relinquish_shared_pages(d); +- if ( ret ) +- return ret; +- +- /* +- * If the domain is forked, decrement the parent's pause count +- * and release the domain. +- */ +- if ( mem_sharing_is_fork(d) ) +- { +- struct domain *parent = d->parent; +- +- d->parent = NULL; +- domain_unpause(parent); +- put_domain(parent); +- } +- } +-#endif +- + spin_lock(&d->page_alloc_lock); + page_list_splice(&d->arch.relmem_list, &d->page_list); + INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); +-- +2.40.0 + diff --git a/0029-x86emul-respect-NSCB.patch b/0029-x86emul-respect-NSCB.patch deleted file mode 100644 index 08785b7..0000000 --- a/0029-x86emul-respect-NSCB.patch +++ /dev/null @@ -1,40 +0,0 @@ -From 5dae06578cd5dcc312175b00ed6836a85732438d Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:19:35 +0100 -Subject: [PATCH 29/87] x86emul: respect NSCB - -protmode_load_seg() would better adhere to that "feature" of clearing -base (and limit) during NULL selector loads. 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 87a20c98d9f0f422727fe9b4b9e22c2c43a5cd9c -master date: 2022-10-11 14:30:41 +0200 ---- - xen/arch/x86/x86_emulate/x86_emulate.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c -index 441086ea861d..847f8f37719f 100644 ---- a/xen/arch/x86/x86_emulate/x86_emulate.c -+++ b/xen/arch/x86/x86_emulate/x86_emulate.c -@@ -1970,6 +1970,7 @@ amd_like(const struct x86_emulate_ctxt *ctxt) - #define vcpu_has_tbm() (ctxt->cpuid->extd.tbm) - #define vcpu_has_clzero() (ctxt->cpuid->extd.clzero) - #define vcpu_has_wbnoinvd() (ctxt->cpuid->extd.wbnoinvd) -+#define vcpu_has_nscb() (ctxt->cpuid->extd.nscb) - - #define vcpu_has_bmi1() (ctxt->cpuid->feat.bmi1) - #define vcpu_has_hle() (ctxt->cpuid->feat.hle) -@@ -2102,7 +2103,7 @@ protmode_load_seg( - case x86_seg_tr: - goto raise_exn; - } -- if ( !_amd_like(cp) || !ops->read_segment || -+ if ( !_amd_like(cp) || vcpu_has_nscb() || !ops->read_segment || - ops->read_segment(seg, sreg, ctxt) != X86EMUL_OKAY ) - memset(sreg, 0, sizeof(*sreg)); - else --- -2.37.4 - diff --git a/0030-VMX-correct-error-handling-in-vmx_create_vmcs.patch b/0030-VMX-correct-error-handling-in-vmx_create_vmcs.patch deleted file mode 100644 index e1b618d..0000000 --- a/0030-VMX-correct-error-handling-in-vmx_create_vmcs.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 02ab5e97c41d275ccea0910b1d8bce41ed1be5bf Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:20:40 +0100 -Subject: [PATCH 30/87] VMX: correct error handling in vmx_create_vmcs() - -With the addition of vmx_add_msr() calls to construct_vmcs() there are -now cases where simply freeing the VMCS isn't enough: The MSR bitmap -page as well as one of the MSR area ones (if it's the 2nd vmx_add_msr() -which fails) may also need freeing. 
Switch to using vmx_destroy_vmcs() -instead. - -Fixes: 3bd36952dab6 ("x86/spec-ctrl: Introduce an option to control L1D_FLUSH for HVM HAP guests") -Fixes: 53a570b28569 ("x86/spec-ctrl: Support IBPB-on-entry") -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Kevin Tian <kevin.tian@intel.com> -master commit: 448d28309f1a966bdc850aff1a637e0b79a03e43 -master date: 2022-10-12 17:57:56 +0200 ---- - xen/arch/x86/hvm/vmx/vmcs.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c -index dd817cee4e69..237b13459d4f 100644 ---- a/xen/arch/x86/hvm/vmx/vmcs.c -+++ b/xen/arch/x86/hvm/vmx/vmcs.c -@@ -1831,7 +1831,7 @@ int vmx_create_vmcs(struct vcpu *v) - - if ( (rc = construct_vmcs(v)) != 0 ) - { -- vmx_free_vmcs(vmx->vmcs_pa); -+ vmx_destroy_vmcs(v); - return rc; - } - --- -2.37.4 - diff --git a/0030-xen-Work-around-Clang-IAS-macro-expansion-bug.patch b/0030-xen-Work-around-Clang-IAS-macro-expansion-bug.patch new file mode 100644 index 0000000..a92f2f0 --- /dev/null +++ b/0030-xen-Work-around-Clang-IAS-macro-expansion-bug.patch @@ -0,0 +1,115 @@ +From 2b8f72a6b40dafc3fb40bce100cd62c4a377535a Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 3 Mar 2023 08:14:57 +0100 +Subject: [PATCH 30/61] xen: Work around Clang-IAS macro \@ expansion bug + +https://github.com/llvm/llvm-project/issues/60792 + +It turns out that Clang-IAS does not expand \@ uniquely in a translation +unit, and the XSA-426 change tickles this bug: + + <instantiation>:4:1: error: invalid symbol redefinition + .L1_fill_rsb_loop: + ^ + make[3]: *** [Rules.mk:247: arch/x86/acpi/cpu_idle.o] Error 1 + +Extend DO_OVERWRITE_RSB with an optional parameter so C callers can mix %= in +too, which Clang does seem to expand properly. 
+ +Fixes: 63305e5392ec ("x86/spec-ctrl: Mitigate Cross-Thread Return Address Predictions") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: a2adacff0b91cc7b977abb209dc419a2ef15963f +master date: 2023-02-24 17:44:29 +0000 +--- + xen/include/asm-x86/spec_ctrl.h | 4 ++-- + xen/include/asm-x86/spec_ctrl_asm.h | 23 ++++++++++++++--------- + 2 files changed, 16 insertions(+), 11 deletions(-) + +diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h +index 391973ef6a..a431fea587 100644 +--- a/xen/include/asm-x86/spec_ctrl.h ++++ b/xen/include/asm-x86/spec_ctrl.h +@@ -83,7 +83,7 @@ static always_inline void spec_ctrl_new_guest_context(void) + wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); + + /* (ab)use alternative_input() to specify clobbers. */ +- alternative_input("", "DO_OVERWRITE_RSB", X86_BUG_IBPB_NO_RET, ++ alternative_input("", "DO_OVERWRITE_RSB xu=%=", X86_BUG_IBPB_NO_RET, + : "rax", "rcx"); + } + +@@ -172,7 +172,7 @@ static always_inline void spec_ctrl_enter_idle(struct cpu_info *info) + * + * (ab)use alternative_input() to specify clobbers. + */ +- alternative_input("", "DO_OVERWRITE_RSB", X86_FEATURE_SC_RSB_IDLE, ++ alternative_input("", "DO_OVERWRITE_RSB xu=%=", X86_FEATURE_SC_RSB_IDLE, + : "rax", "rcx"); + } + +diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h +index 9eb4ad9ab7..b61a5571ae 100644 +--- a/xen/include/asm-x86/spec_ctrl_asm.h ++++ b/xen/include/asm-x86/spec_ctrl_asm.h +@@ -117,11 +117,16 @@ + .L\@_done: + .endm + +-.macro DO_OVERWRITE_RSB tmp=rax ++.macro DO_OVERWRITE_RSB tmp=rax xu + /* + * Requires nothing + * Clobbers \tmp (%rax by default), %rcx + * ++ * xu is an optional parameter to add eXtra Uniqueness. It is intended for ++ * passing %= in from an asm() block, in order to work around ++ * https://github.com/llvm/llvm-project/issues/60792 where Clang-IAS doesn't ++ * expand \@ uniquely. 
++ * + * Requires 256 bytes of {,shadow}stack space, but %rsp/SSP has no net + * change. Based on Google's performance numbers, the loop is unrolled to 16 + * iterations and two calls per iteration. +@@ -137,31 +142,31 @@ + mov $16, %ecx /* 16 iterations, two calls per loop */ + mov %rsp, %\tmp /* Store the current %rsp */ + +-.L\@_fill_rsb_loop: ++.L\@_fill_rsb_loop\xu: + + .irp n, 1, 2 /* Unrolled twice. */ +- call .L\@_insert_rsb_entry_\n /* Create an RSB entry. */ ++ call .L\@_insert_rsb_entry\xu\n /* Create an RSB entry. */ + +-.L\@_capture_speculation_\n: ++.L\@_capture_speculation\xu\n: + pause + lfence +- jmp .L\@_capture_speculation_\n /* Capture rogue speculation. */ ++ jmp .L\@_capture_speculation\xu\n /* Capture rogue speculation. */ + +-.L\@_insert_rsb_entry_\n: ++.L\@_insert_rsb_entry\xu\n: + .endr + + sub $1, %ecx +- jnz .L\@_fill_rsb_loop ++ jnz .L\@_fill_rsb_loop\xu + mov %\tmp, %rsp /* Restore old %rsp */ + + #ifdef CONFIG_XEN_SHSTK + mov $1, %ecx + rdsspd %ecx + cmp $1, %ecx +- je .L\@_shstk_done ++ je .L\@_shstk_done\xu + mov $64, %ecx /* 64 * 4 bytes, given incsspd */ + incsspd %ecx /* Restore old SSP */ +-.L\@_shstk_done: ++.L\@_shstk_done\xu: + #endif + .endm + +-- +2.40.0 + diff --git a/0031-argo-Remove-reachable-ASSERT_UNREACHABLE.patch b/0031-argo-Remove-reachable-ASSERT_UNREACHABLE.patch deleted file mode 100644 index e89709d..0000000 --- a/0031-argo-Remove-reachable-ASSERT_UNREACHABLE.patch +++ /dev/null @@ -1,41 +0,0 @@ -From d4a11d6a22cf73ac7441750e5e8113779348885e Mon Sep 17 00:00:00 2001 -From: Jason Andryuk <jandryuk@gmail.com> -Date: Mon, 31 Oct 2022 13:21:31 +0100 -Subject: [PATCH 31/87] argo: Remove reachable ASSERT_UNREACHABLE - -I observed this ASSERT_UNREACHABLE in partner_rings_remove consistently -trip. It was in OpenXT with the viptables patch applied. - -dom10 shuts down. -dom7 is REJECTED sending to dom10. -dom7 shuts down and this ASSERT trips for dom10. 
- -The argo_send_info has a domid, but there is no refcount taken on -the domain. Therefore it's not appropriate to ASSERT that the domain -can be looked up via domid. Replace with a debug message. - -Signed-off-by: Jason Andryuk <jandryuk@gmail.com> -Reviewed-by: Christopher Clark <christopher.w.clark@gmail.com> -master commit: 197f612b77c5afe04e60df2100a855370d720ad7 -master date: 2022-10-14 14:45:41 +0100 ---- - xen/common/argo.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/xen/common/argo.c b/xen/common/argo.c -index eaea7ba8885a..80f3275092af 100644 ---- a/xen/common/argo.c -+++ b/xen/common/argo.c -@@ -1298,7 +1298,8 @@ partner_rings_remove(struct domain *src_d) - ASSERT_UNREACHABLE(); - } - else -- ASSERT_UNREACHABLE(); -+ argo_dprintk("%pd has entry for stale partner d%u\n", -+ src_d, send_info->id.domain_id); - - if ( dst_d ) - rcu_unlock_domain(dst_d); --- -2.37.4 - diff --git a/0031-xen-Fix-Clang-Wunicode-diagnostic-when-building-asm-.patch b/0031-xen-Fix-Clang-Wunicode-diagnostic-when-building-asm-.patch new file mode 100644 index 0000000..bad0316 --- /dev/null +++ b/0031-xen-Fix-Clang-Wunicode-diagnostic-when-building-asm-.patch @@ -0,0 +1,83 @@ +From f073db0a07c5f6800a70c91819c4b8c2ba359451 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 3 Mar 2023 08:15:50 +0100 +Subject: [PATCH 31/61] xen: Fix Clang -Wunicode diagnostic when building + asm-macros + +While trying to work around a different Clang-IAS bug (parent changeset), I +stumbled onto: + + In file included from arch/x86/asm-macros.c:3: + ./arch/x86/include/asm/spec_ctrl_asm.h:144:19: error: \u used with + no following hex digits; treating as '\' followed by identifier [-Werror,-Wunicode] + .L\@_fill_rsb_loop\uniq: + ^ + +It turns out that Clang -E is sensitive to the file extension of the source +file it is processing. 
Furthermore, C explicitly permits the use of \u +escapes in identifier names, so the diagnostic would be reasonable in +principle if we were trying to compile the result. + +asm-macros should really have been .S from the outset, as it is ultimately +generating assembly, not C. Rename it, which causes Clang not to complain. + +We need to introduce rules for generating a .i file from .S, and substituting +c_flags for a_flags lets us drop the now-redundant -D__ASSEMBLY__. + +No functional change. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 53f0d02040b1df08f0589f162790ca376e1c2040 +master date: 2023-02-24 17:44:29 +0000 +--- + xen/Rules.mk | 6 ++++++ + xen/arch/x86/Makefile | 2 +- + xen/arch/x86/{asm-macros.c => asm-macros.S} | 0 + 3 files changed, 7 insertions(+), 1 deletion(-) + rename xen/arch/x86/{asm-macros.c => asm-macros.S} (100%) + +diff --git a/xen/Rules.mk b/xen/Rules.mk +index 5e0699e58b..1f171f88e2 100644 +--- a/xen/Rules.mk ++++ b/xen/Rules.mk +@@ -223,6 +223,9 @@ $(filter %.init.o,$(obj-y) $(obj-bin-y) $(extra-y)): %.init.o: %.o FORCE + quiet_cmd_cpp_i_c = CPP $@ + cmd_cpp_i_c = $(CPP) $(call cpp_flags,$(c_flags)) -MQ $@ -o $@ $< + ++quiet_cmd_cpp_i_S = CPP $@ ++cmd_cpp_i_S = $(CPP) $(call cpp_flags,$(a_flags)) -MQ $@ -o $@ $< ++ + quiet_cmd_cc_s_c = CC $@ + cmd_cc_s_c = $(CC) $(filter-out -Wa$(comma)%,$(c_flags)) -S $< -o $@ + +@@ -232,6 +235,9 @@ cmd_cpp_s_S = $(CPP) $(call cpp_flags,$(a_flags)) -MQ $@ -o $@ $< + %.i: %.c FORCE + $(call if_changed,cpp_i_c) + ++%.i: %.S FORCE ++ $(call if_changed,cpp_i_S) ++ + %.s: %.c FORCE + $(call if_changed,cc_s_c) + +diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile +index 69b6cfaded..8e975f472d 100644 +--- a/xen/arch/x86/Makefile ++++ b/xen/arch/x86/Makefile +@@ -273,7 +273,7 @@ efi/buildid.o efi/relocs-dummy.o: ; + .PHONY: include + include: $(BASEDIR)/include/asm-x86/asm-macros.h + +-asm-macros.i: CFLAGS-y += -D__ASSEMBLY__ -P 
++asm-macros.i: CFLAGS-y += -P + + $(BASEDIR)/include/asm-x86/asm-macros.h: asm-macros.i Makefile + echo '#if 0' >$@.new +diff --git a/xen/arch/x86/asm-macros.c b/xen/arch/x86/asm-macros.S +similarity index 100% +rename from xen/arch/x86/asm-macros.c +rename to xen/arch/x86/asm-macros.S +-- +2.40.0 + diff --git a/0032-EFI-don-t-convert-memory-marked-for-runtime-use-to-o.patch b/0032-EFI-don-t-convert-memory-marked-for-runtime-use-to-o.patch deleted file mode 100644 index 33b98df..0000000 --- a/0032-EFI-don-t-convert-memory-marked-for-runtime-use-to-o.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 54f8ed80c8308e65c3f57ae6cbd130f43f5ecbbd Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:22:17 +0100 -Subject: [PATCH 32/87] EFI: don't convert memory marked for runtime use to - ordinary RAM -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -efi_init_memory() in both relevant places is treating EFI_MEMORY_RUNTIME -higher priority than the type of the range. To avoid accessing memory at -runtime which was re-used for other purposes, make -efi_arch_process_memory_map() follow suit. While in theory the same would -apply to EfiACPIReclaimMemory, we don't actually "reclaim" or clobber -that memory (converted to E820_ACPI on x86) there (and it would be a bug -if the Dom0 kernel tried to reclaim the range, bypassing Xen's memory -management, plus it would be at least bogus if it clobbered that space), -hence that type's handling can be left alone. 
- -Fixes: bf6501a62e80 ("x86-64: EFI boot code") -Fixes: facac0af87ef ("x86-64: EFI runtime code") -Fixes: 6d70ea10d49f ("Add ARM EFI boot support") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -master commit: f324300c8347b6aa6f9c0b18e0a90bbf44011a9a -master date: 2022-10-21 12:30:24 +0200 ---- - xen/arch/arm/efi/efi-boot.h | 3 ++- - xen/arch/x86/efi/efi-boot.h | 4 +++- - 2 files changed, 5 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/arm/efi/efi-boot.h b/xen/arch/arm/efi/efi-boot.h -index 9f267982397b..849071fe5308 100644 ---- a/xen/arch/arm/efi/efi-boot.h -+++ b/xen/arch/arm/efi/efi-boot.h -@@ -194,7 +194,8 @@ static EFI_STATUS __init efi_process_memory_map_bootinfo(EFI_MEMORY_DESCRIPTOR * - - for ( Index = 0; Index < (mmap_size / desc_size); Index++ ) - { -- if ( desc_ptr->Attribute & EFI_MEMORY_WB && -+ if ( !(desc_ptr->Attribute & EFI_MEMORY_RUNTIME) && -+ (desc_ptr->Attribute & EFI_MEMORY_WB) && - (desc_ptr->Type == EfiConventionalMemory || - desc_ptr->Type == EfiLoaderCode || - desc_ptr->Type == EfiLoaderData || -diff --git a/xen/arch/x86/efi/efi-boot.h b/xen/arch/x86/efi/efi-boot.h -index 4ee77fb9bfa2..d99601622310 100644 ---- a/xen/arch/x86/efi/efi-boot.h -+++ b/xen/arch/x86/efi/efi-boot.h -@@ -185,7 +185,9 @@ static void __init efi_arch_process_memory_map(EFI_SYSTEM_TABLE *SystemTable, - /* fall through */ - case EfiLoaderCode: - case EfiLoaderData: -- if ( desc->Attribute & EFI_MEMORY_WB ) -+ if ( desc->Attribute & EFI_MEMORY_RUNTIME ) -+ type = E820_RESERVED; -+ else if ( desc->Attribute & EFI_MEMORY_WB ) - type = E820_RAM; - else - case EfiUnusableMemory: --- -2.37.4 - diff --git a/0032-tools-Use-PKG_CONFIG_FILE-instead-of-PKG_CONFIG-vari.patch b/0032-tools-Use-PKG_CONFIG_FILE-instead-of-PKG_CONFIG-vari.patch new file mode 100644 index 0000000..bfcdd26 --- /dev/null +++ b/0032-tools-Use-PKG_CONFIG_FILE-instead-of-PKG_CONFIG-vari.patch @@ -0,0 
+1,98 @@ +From a2adc7fcc22405e81dc11290416e6140bb0244ca Mon Sep 17 00:00:00 2001 +From: Bertrand Marquis <bertrand.marquis@arm.com> +Date: Fri, 3 Mar 2023 08:16:45 +0100 +Subject: [PATCH 32/61] tools: Use PKG_CONFIG_FILE instead of PKG_CONFIG + variable + +Replace PKG_CONFIG variable name with PKG_CONFIG_FILE for the name of +the pkg-config file. +This is preventing a conflict in some build systems where PKG_CONFIG +actually contains the path to the pkg-config executable to use, as the +default assignment in libs.mk is using a weak assignment (?=). + +This problem has been found when trying to build the latest version of +Xen tools using buildroot. + +Fixes: d400dc5729e4 ("tools: tweak tools/libs/libs.mk for being able to support libxenctrl") +Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com> +Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> +master commit: b97e2fe7b9e1f4706693552697239ac2b71efee4 +master date: 2023-02-24 17:44:29 +0000 +--- + tools/libs/ctrl/Makefile | 2 +- + tools/libs/libs.mk | 13 +++++++------ + 2 files changed, 8 insertions(+), 7 deletions(-) + +diff --git a/tools/libs/ctrl/Makefile b/tools/libs/ctrl/Makefile +index 6ff5918798..d3666ae7ff 100644 +--- a/tools/libs/ctrl/Makefile ++++ b/tools/libs/ctrl/Makefile +@@ -47,7 +47,7 @@ CFLAGS += -include $(XEN_ROOT)/tools/config.h + CFLAGS-$(CONFIG_Linux) += -D_GNU_SOURCE + + LIBHEADER := xenctrl.h xenctrl_compat.h +-PKG_CONFIG := xencontrol.pc ++PKG_CONFIG_FILE := xencontrol.pc + PKG_CONFIG_NAME := Xencontrol + + NO_HEADERS_CHK := y +diff --git a/tools/libs/libs.mk b/tools/libs/libs.mk +index f1554462fb..0e005218e2 100644 +--- a/tools/libs/libs.mk ++++ b/tools/libs/libs.mk +@@ -1,7 +1,7 @@ + # Common Makefile for building a lib. 
+ # + # Variables taken as input: +-# PKG_CONFIG: name of pkg-config file (xen$(LIBNAME).pc if empty) ++# PKG_CONFIG_FILE: name of pkg-config file (xen$(LIBNAME).pc if empty) + # MAJOR: major version of lib (Xen version if empty) + # MINOR: minor version of lib (0 if empty) + +@@ -29,7 +29,8 @@ endif + comma:= , + empty:= + space:= $(empty) $(empty) +-PKG_CONFIG ?= $(LIB_FILE_NAME).pc ++ ++PKG_CONFIG_FILE ?= $(LIB_FILE_NAME).pc + PKG_CONFIG_NAME ?= Xen$(LIBNAME) + PKG_CONFIG_DESC ?= The $(PKG_CONFIG_NAME) library for Xen hypervisor + PKG_CONFIG_VERSION := $(MAJOR).$(MINOR) +@@ -38,13 +39,13 @@ PKG_CONFIG_LIB := $(LIB_FILE_NAME) + PKG_CONFIG_REQPRIV := $(subst $(space),$(comma),$(strip $(foreach lib,$(patsubst ctrl,control,$(USELIBS_$(LIBNAME))),xen$(lib)))) + + ifneq ($(CONFIG_LIBXC_MINIOS),y) +-PKG_CONFIG_INST := $(PKG_CONFIG) ++PKG_CONFIG_INST := $(PKG_CONFIG_FILE) + $(PKG_CONFIG_INST): PKG_CONFIG_PREFIX = $(prefix) + $(PKG_CONFIG_INST): PKG_CONFIG_INCDIR = $(includedir) + $(PKG_CONFIG_INST): PKG_CONFIG_LIBDIR = $(libdir) + endif + +-PKG_CONFIG_LOCAL := $(PKG_CONFIG_DIR)/$(PKG_CONFIG) ++PKG_CONFIG_LOCAL := $(PKG_CONFIG_DIR)/$(PKG_CONFIG_FILE) + + LIBHEADER ?= $(LIB_FILE_NAME).h + LIBHEADERS = $(foreach h, $(LIBHEADER), $(XEN_INCLUDE)/$(h)) +@@ -114,7 +115,7 @@ install: build + $(SYMLINK_SHLIB) lib$(LIB_FILE_NAME).so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)/lib$(LIB_FILE_NAME).so.$(MAJOR) + $(SYMLINK_SHLIB) lib$(LIB_FILE_NAME).so.$(MAJOR) $(DESTDIR)$(libdir)/lib$(LIB_FILE_NAME).so + for i in $(LIBHEADERS); do $(INSTALL_DATA) $$i $(DESTDIR)$(includedir); done +- $(INSTALL_DATA) $(PKG_CONFIG) $(DESTDIR)$(PKG_INSTALLDIR) ++ $(INSTALL_DATA) $(PKG_CONFIG_FILE) $(DESTDIR)$(PKG_INSTALLDIR) + + .PHONY: uninstall + uninstall: +@@ -134,7 +135,7 @@ clean: + rm -rf *.rpm $(LIB) *~ $(DEPS_RM) $(LIB_OBJS) $(PIC_OBJS) + rm -f lib$(LIB_FILE_NAME).so.$(MAJOR).$(MINOR) lib$(LIB_FILE_NAME).so.$(MAJOR) + rm -f headers.chk headers.lst +- rm -f $(PKG_CONFIG) ++ rm -f $(PKG_CONFIG_FILE) + 
rm -f _paths.h + + .PHONY: distclean +-- +2.40.0 + diff --git a/0033-libs-guest-Fix-resource-leaks-in-xc_core_arch_map_p2.patch b/0033-libs-guest-Fix-resource-leaks-in-xc_core_arch_map_p2.patch new file mode 100644 index 0000000..5caa850 --- /dev/null +++ b/0033-libs-guest-Fix-resource-leaks-in-xc_core_arch_map_p2.patch @@ -0,0 +1,65 @@ +From b181a3a5532574d2163408284bcd785ec87fe046 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 3 Mar 2023 08:17:04 +0100 +Subject: [PATCH 33/61] libs/guest: Fix resource leaks in + xc_core_arch_map_p2m_tree_rw() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Edwin, with the help of GCC's -fanalyzer, identified that p2m_frame_list_list +gets leaked. What fanalyzer can't see is that the live_p2m_frame_list_list +and live_p2m_frame_list foreign mappings are leaked too. + +Rework the logic so the out path is executed unconditionally, which cleans up +all the intermediate allocations/mappings appropriately. 
+ +Fixes: bd7a29c3d0b9 ("tools/libs/ctrl: fix xc_core_arch_map_p2m() to support linear p2m table") +Reported-by: Edwin Török <edwin.torok@cloud.com> +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Juergen Gross <jgross@suse.com> +master commit: 1868d7f22660c8980bd0a7e53f044467e8b63bb5 +master date: 2023-02-27 15:51:23 +0000 +--- + tools/libs/guest/xg_core_x86.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/tools/libs/guest/xg_core_x86.c b/tools/libs/guest/xg_core_x86.c +index 61106b98b8..c5e4542ccc 100644 +--- a/tools/libs/guest/xg_core_x86.c ++++ b/tools/libs/guest/xg_core_x86.c +@@ -229,11 +229,11 @@ xc_core_arch_map_p2m_tree_rw(xc_interface *xch, struct domain_info_context *dinf + uint32_t dom, shared_info_any_t *live_shinfo) + { + /* Double and single indirect references to the live P2M table */ +- xen_pfn_t *live_p2m_frame_list_list; ++ xen_pfn_t *live_p2m_frame_list_list = NULL; + xen_pfn_t *live_p2m_frame_list = NULL; + /* Copies of the above. 
*/ + xen_pfn_t *p2m_frame_list_list = NULL; +- xen_pfn_t *p2m_frame_list; ++ xen_pfn_t *p2m_frame_list = NULL; + + int err; + int i; +@@ -297,8 +297,6 @@ xc_core_arch_map_p2m_tree_rw(xc_interface *xch, struct domain_info_context *dinf + + dinfo->p2m_frames = P2M_FL_ENTRIES; + +- return p2m_frame_list; +- + out: + err = errno; + +@@ -312,7 +310,7 @@ xc_core_arch_map_p2m_tree_rw(xc_interface *xch, struct domain_info_context *dinf + + errno = err; + +- return NULL; ++ return p2m_frame_list; + } + + static int +-- +2.40.0 + diff --git a/0033-xen-sched-fix-race-in-RTDS-scheduler.patch b/0033-xen-sched-fix-race-in-RTDS-scheduler.patch deleted file mode 100644 index 93ee04b..0000000 --- a/0033-xen-sched-fix-race-in-RTDS-scheduler.patch +++ /dev/null @@ -1,42 +0,0 @@ -From 481465f35da1bcec0b2a4dfd6fc51d86cac28547 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Mon, 31 Oct 2022 13:22:54 +0100 -Subject: [PATCH 33/87] xen/sched: fix race in RTDS scheduler - -When a domain gets paused the unit runnable state can change to "not -runnable" without the scheduling lock being involved. This means that -a specific scheduler isn't involved in this change of runnable state. - -In the RTDS scheduler this can result in an inconsistency in case a -unit is losing its "runnable" capability while the RTDS scheduler's -scheduling function is active. RTDS will remove the unit from the run -queue, but doesn't do so for the replenish queue, leading to hitting -an ASSERT() in replq_insert() later when the domain is unpaused again. - -Fix that by removing the unit from the replenish queue as well in this -case. 
- -Fixes: 7c7b407e7772 ("xen/sched: introduce unit_runnable_state()") -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Dario Faggioli <dfaggioli@suse.com> -master commit: 73c62927f64ecb48f27d06176befdf76b879f340 -master date: 2022-10-21 12:32:23 +0200 ---- - xen/common/sched/rt.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/xen/common/sched/rt.c b/xen/common/sched/rt.c -index c24cd2ac3200..ec2ca1bebc26 100644 ---- a/xen/common/sched/rt.c -+++ b/xen/common/sched/rt.c -@@ -1087,6 +1087,7 @@ rt_schedule(const struct scheduler *ops, struct sched_unit *currunit, - else if ( !unit_runnable_state(snext->unit) ) - { - q_remove(snext); -+ replq_remove(ops, snext); - snext = rt_unit(sched_idle_unit(sched_cpu)); - } - --- -2.37.4 - diff --git a/0034-libs-guest-Fix-leak-on-realloc-failure-in-backup_pte.patch b/0034-libs-guest-Fix-leak-on-realloc-failure-in-backup_pte.patch new file mode 100644 index 0000000..4be16a3 --- /dev/null +++ b/0034-libs-guest-Fix-leak-on-realloc-failure-in-backup_pte.patch @@ -0,0 +1,56 @@ +From 25d103f2eb59f021cce61f07a0bf0bfa696b4416 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edwin.torok@cloud.com> +Date: Fri, 3 Mar 2023 08:17:23 +0100 +Subject: [PATCH 34/61] libs/guest: Fix leak on realloc failure in + backup_ptes() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From `man 2 realloc`: + + If realloc() fails, the original block is left untouched; it is not freed or moved. 
+ +Found using GCC -fanalyzer: + + | 184 | backup->entries = realloc(backup->entries, + | | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + | | | | | + | | | | (91) when ‘realloc’ fails + | | | (92) ‘old_ptes.entries’ leaks here; was allocated at (44) + | | (90) ...to here + +Signed-off-by: Edwin Török <edwin.torok@cloud.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 275d13184cfa52ebe4336ed66526ce93716adbe0 +master date: 2023-02-27 15:51:23 +0000 +--- + tools/libs/guest/xg_offline_page.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/tools/libs/guest/xg_offline_page.c b/tools/libs/guest/xg_offline_page.c +index cfe0e2d537..c42b973363 100644 +--- a/tools/libs/guest/xg_offline_page.c ++++ b/tools/libs/guest/xg_offline_page.c +@@ -181,10 +181,16 @@ static int backup_ptes(xen_pfn_t table_mfn, int offset, + + if (backup->max == backup->cur) + { +- backup->entries = realloc(backup->entries, +- backup->max * 2 * sizeof(struct pte_backup_entry)); ++ void *orig = backup->entries; ++ ++ backup->entries = realloc( ++ orig, backup->max * 2 * sizeof(struct pte_backup_entry)); ++ + if (backup->entries == NULL) ++ { ++ free(orig); + return -1; ++ } + else + backup->max *= 2; + } +-- +2.40.0 + diff --git a/0034-xen-sched-fix-restore_vcpu_affinity-by-removing-it.patch b/0034-xen-sched-fix-restore_vcpu_affinity-by-removing-it.patch deleted file mode 100644 index eecec07..0000000 --- a/0034-xen-sched-fix-restore_vcpu_affinity-by-removing-it.patch +++ /dev/null @@ -1,158 +0,0 @@ -From 88f2bf5de9ad789e1c61b5d5ecf118909eed6917 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Mon, 31 Oct 2022 13:23:50 +0100 -Subject: [PATCH 34/87] xen/sched: fix restore_vcpu_affinity() by removing it -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -When the system is coming up after having been suspended, -restore_vcpu_affinity() is called for each domain in order to adjust -the 
vcpu's affinity settings in case a cpu didn't come to live again. - -The way restore_vcpu_affinity() is doing that is wrong, because the -specific scheduler isn't being informed about a possible migration of -the vcpu to another cpu. Additionally the migration is often even -happening if all cpus are running again, as it is done without check -whether it is really needed. - -As cpupool management is already calling cpu_disable_scheduler() for -cpus not having come up again, and cpu_disable_scheduler() is taking -care of eventually needed vcpu migration in the proper way, there is -simply no need for restore_vcpu_affinity(). - -So just remove restore_vcpu_affinity() completely, together with the -no longer used sched_reset_affinity_broken(). - -Fixes: 8a04eaa8ea83 ("xen/sched: move some per-vcpu items to struct sched_unit") -Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Dario Faggioli <dfaggioli@suse.com> -Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> -master commit: fce1f381f7388daaa3e96dbb0d67d7a3e4bb2d2d -master date: 2022-10-24 11:16:27 +0100 ---- - xen/arch/x86/acpi/power.c | 3 -- - xen/common/sched/core.c | 78 --------------------------------------- - xen/include/xen/sched.h | 1 - - 3 files changed, 82 deletions(-) - -diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c -index dd397f713067..1a7baeebe6d0 100644 ---- a/xen/arch/x86/acpi/power.c -+++ b/xen/arch/x86/acpi/power.c -@@ -159,10 +159,7 @@ static void thaw_domains(void) - - rcu_read_lock(&domlist_read_lock); - for_each_domain ( d ) -- { -- restore_vcpu_affinity(d); - domain_unpause(d); -- } - rcu_read_unlock(&domlist_read_lock); - } - -diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c -index 900aab8f66a7..9173cf690c72 100644 ---- a/xen/common/sched/core.c -+++ b/xen/common/sched/core.c -@@ -1188,84 +1188,6 @@ static bool sched_check_affinity_broken(const struct 
sched_unit *unit) - return false; - } - --static void sched_reset_affinity_broken(const struct sched_unit *unit) --{ -- struct vcpu *v; -- -- for_each_sched_unit_vcpu ( unit, v ) -- v->affinity_broken = false; --} -- --void restore_vcpu_affinity(struct domain *d) --{ -- unsigned int cpu = smp_processor_id(); -- struct sched_unit *unit; -- -- ASSERT(system_state == SYS_STATE_resume); -- -- rcu_read_lock(&sched_res_rculock); -- -- for_each_sched_unit ( d, unit ) -- { -- spinlock_t *lock; -- unsigned int old_cpu = sched_unit_master(unit); -- struct sched_resource *res; -- -- ASSERT(!unit_runnable(unit)); -- -- /* -- * Re-assign the initial processor as after resume we have no -- * guarantee the old processor has come back to life again. -- * -- * Therefore, here, before actually unpausing the domains, we should -- * set v->processor of each of their vCPUs to something that will -- * make sense for the scheduler of the cpupool in which they are in. -- */ -- lock = unit_schedule_lock_irq(unit); -- -- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, -- cpupool_domain_master_cpumask(d)); -- if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) -- { -- if ( sched_check_affinity_broken(unit) ) -- { -- sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); -- sched_reset_affinity_broken(unit); -- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, -- cpupool_domain_master_cpumask(d)); -- } -- -- if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) -- { -- /* Affinity settings of one vcpu are for the complete unit. */ -- printk(XENLOG_DEBUG "Breaking affinity for %pv\n", -- unit->vcpu_list); -- sched_set_affinity(unit, &cpumask_all, NULL); -- cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, -- cpupool_domain_master_cpumask(d)); -- } -- } -- -- res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu))); -- sched_set_res(unit, res); -- -- spin_unlock_irq(lock); -- -- /* v->processor might have changed, so reacquire the lock. 
*/ -- lock = unit_schedule_lock_irq(unit); -- res = sched_pick_resource(unit_scheduler(unit), unit); -- sched_set_res(unit, res); -- spin_unlock_irq(lock); -- -- if ( old_cpu != sched_unit_master(unit) ) -- sched_move_irqs(unit); -- } -- -- rcu_read_unlock(&sched_res_rculock); -- -- domain_update_node_affinity(d); --} -- - /* - * This function is used by cpu_hotplug code via cpu notifier chain - * and from cpupools to switch schedulers on a cpu. -diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h -index 3f4225738a40..1a1fab5239ec 100644 ---- a/xen/include/xen/sched.h -+++ b/xen/include/xen/sched.h -@@ -999,7 +999,6 @@ void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value); - void sched_setup_dom0_vcpus(struct domain *d); - int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason); - int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity); --void restore_vcpu_affinity(struct domain *d); - int vcpu_affinity_domctl(struct domain *d, uint32_t cmd, - struct xen_domctl_vcpuaffinity *vcpuaff); - --- -2.37.4 - diff --git a/0035-x86-shadow-drop-replace-bogus-assertions.patch b/0035-x86-shadow-drop-replace-bogus-assertions.patch deleted file mode 100644 index 55e9f62..0000000 --- a/0035-x86-shadow-drop-replace-bogus-assertions.patch +++ /dev/null @@ -1,71 +0,0 @@ -From 9fdb4f17656f74b35af0882b558e44832ff00b5f Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:24:33 +0100 -Subject: [PATCH 35/87] x86/shadow: drop (replace) bogus assertions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The addition of a call to shadow_blow_tables() from shadow_teardown() -has resulted in the "no vcpus" related assertion becoming triggerable: -If domain_create() fails with at least one page successfully allocated -in the course of shadow_enable(), or if domain_create() succeeds and -the domain is then killed without ever invoking XEN_DOMCTL_max_vcpus. 
-Note that in-tree tests (test-resource and test-tsx) do exactly the -latter of these two. - -The assertion's comment was bogus anyway: Shadow mode has been getting -enabled before allocation of vCPU-s for quite some time. Convert the -assertion to a conditional: As long as there are no vCPU-s, there's -nothing to blow away. - -Fixes: e7aa55c0aab3 ("x86/p2m: free the paging memory pool preemptively") -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> - -A similar assertion/comment pair exists in _shadow_prealloc(); the -comment is similarly bogus, and the assertion could in principle trigger -e.g. when shadow_alloc_p2m_page() is called early enough. Replace those -at the same time by a similar early return, here indicating failure to -the caller (which will generally lead to the domain being crashed in -shadow_prealloc()). - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: a92dc2bb30ba65ae25d2f417677eb7ef9a6a0fef -master date: 2022-10-24 15:46:11 +0200 ---- - xen/arch/x86/mm/shadow/common.c | 10 ++++++---- - 1 file changed, 6 insertions(+), 4 deletions(-) - -diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c -index 3b0d781991b5..1de0139742f7 100644 ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -943,8 +943,9 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - /* No reclaim when the domain is dying, teardown will take care of it. */ - return false; - -- /* Shouldn't have enabled shadows if we've no vcpus. */ -- ASSERT(d->vcpu && d->vcpu[0]); -+ /* Nothing to reclaim when there are no vcpus yet. 
*/ -+ if ( !d->vcpu[0] ) -+ return false; - - /* Stage one: walk the list of pinned pages, unpinning them */ - perfc_incr(shadow_prealloc_1); -@@ -1034,8 +1035,9 @@ void shadow_blow_tables(struct domain *d) - mfn_t smfn; - int i; - -- /* Shouldn't have enabled shadows if we've no vcpus. */ -- ASSERT(d->vcpu && d->vcpu[0]); -+ /* Nothing to do when there are no vcpus yet. */ -+ if ( !d->vcpu[0] ) -+ return; - - /* Pass one: unpin all pinned pages */ - foreach_pinned_shadow(d, sp, t) --- -2.37.4 - diff --git a/0035-x86-ucode-AMD-late-load-the-patch-on-every-logical-t.patch b/0035-x86-ucode-AMD-late-load-the-patch-on-every-logical-t.patch new file mode 100644 index 0000000..931d93f --- /dev/null +++ b/0035-x86-ucode-AMD-late-load-the-patch-on-every-logical-t.patch @@ -0,0 +1,90 @@ +From 84dfe7a56f04a7412fa4869b3e756c49e1cfbe75 Mon Sep 17 00:00:00 2001 +From: Sergey Dyasli <sergey.dyasli@citrix.com> +Date: Fri, 3 Mar 2023 08:17:40 +0100 +Subject: [PATCH 35/61] x86/ucode/AMD: late load the patch on every logical + thread + +Currently late ucode loading is performed only on the first core of CPU +siblings. But according to the latest recommendation from AMD, late +ucode loading should happen on every logical thread/core on AMD CPUs. + +To achieve that, introduce is_cpu_primary() helper which will consider +every logical cpu as "primary" when running on AMD CPUs. Also include +Hygon in the check for future-proofing. 
+ +Signed-off-by: Sergey Dyasli <sergey.dyasli@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: f1315e48a03a42f78f9b03c0a384165baf02acae +master date: 2023-02-28 14:51:28 +0100 +--- + xen/arch/x86/cpu/microcode/core.c | 24 +++++++++++++++++++----- + 1 file changed, 19 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/x86/cpu/microcode/core.c b/xen/arch/x86/cpu/microcode/core.c +index ceec1f1edc..ee7df9a591 100644 +--- a/xen/arch/x86/cpu/microcode/core.c ++++ b/xen/arch/x86/cpu/microcode/core.c +@@ -273,6 +273,20 @@ static bool microcode_update_cache(struct microcode_patch *patch) + return true; + } + ++/* Returns true if ucode should be loaded on a given cpu */ ++static bool is_cpu_primary(unsigned int cpu) ++{ ++ if ( boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) ) ++ /* Load ucode on every logical thread/core */ ++ return true; ++ ++ /* Intel CPUs should load ucode only on the first core of SMT siblings */ ++ if ( cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu)) ) ++ return true; ++ ++ return false; ++} ++ + /* Wait for a condition to be met with a timeout (us). */ + static int wait_for_condition(bool (*func)(unsigned int data), + unsigned int data, unsigned int timeout) +@@ -378,7 +392,7 @@ static int primary_thread_work(const struct microcode_patch *patch) + + static int microcode_nmi_callback(const struct cpu_user_regs *regs, int cpu) + { +- unsigned int primary = cpumask_first(this_cpu(cpu_sibling_mask)); ++ bool primary_cpu = is_cpu_primary(cpu); + int ret; + + /* System-generated NMI, leave to main handler */ +@@ -391,10 +405,10 @@ static int microcode_nmi_callback(const struct cpu_user_regs *regs, int cpu) + * ucode_in_nmi. 
+ */ + if ( cpu == cpumask_first(&cpu_online_map) || +- (!ucode_in_nmi && cpu == primary) ) ++ (!ucode_in_nmi && primary_cpu) ) + return 0; + +- if ( cpu == primary ) ++ if ( primary_cpu ) + ret = primary_thread_work(nmi_patch); + else + ret = secondary_nmi_work(); +@@ -545,7 +559,7 @@ static int do_microcode_update(void *patch) + */ + if ( cpu == cpumask_first(&cpu_online_map) ) + ret = control_thread_fn(patch); +- else if ( cpu == cpumask_first(this_cpu(cpu_sibling_mask)) ) ++ else if ( is_cpu_primary(cpu) ) + ret = primary_thread_fn(patch); + else + ret = secondary_thread_fn(); +@@ -637,7 +651,7 @@ static long microcode_update_helper(void *data) + /* Calculate the number of online CPU core */ + nr_cores = 0; + for_each_online_cpu(cpu) +- if ( cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu)) ) ++ if ( is_cpu_primary(cpu) ) + nr_cores++; + + printk(XENLOG_INFO "%u cores are to update their microcode\n", nr_cores); +-- +2.40.0 + diff --git a/0036-vpci-don-t-assume-that-vpci-per-device-data-exists-u.patch b/0036-vpci-don-t-assume-that-vpci-per-device-data-exists-u.patch deleted file mode 100644 index ab8f792..0000000 --- a/0036-vpci-don-t-assume-that-vpci-per-device-data-exists-u.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 96d26f11f56e83b98ec184f4e0d17161efe3a927 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Mon, 31 Oct 2022 13:25:13 +0100 -Subject: [PATCH 36/87] vpci: don't assume that vpci per-device data exists - unconditionally -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -It's possible for a device to be assigned to a domain but have no -vpci structure if vpci_process_pending() failed and called -vpci_remove_device() as a result. The unconditional accesses done by -vpci_{read,write}() and vpci_remove_device() to pdev->vpci would -then trigger a NULL pointer dereference. - -Add checks for pdev->vpci presence in the affected functions. 
- -Fixes: 9c244fdef7 ('vpci: add header handlers') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 6ccb5e308ceeb895fbccd87a528a8bd24325aa39 -master date: 2022-10-26 14:55:30 +0200 ---- - xen/drivers/vpci/vpci.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c -index dfc8136ffb95..53d78d53911d 100644 ---- a/xen/drivers/vpci/vpci.c -+++ b/xen/drivers/vpci/vpci.c -@@ -37,7 +37,7 @@ extern vpci_register_init_t *const __end_vpci_array[]; - - void vpci_remove_device(struct pci_dev *pdev) - { -- if ( !has_vpci(pdev->domain) ) -+ if ( !has_vpci(pdev->domain) || !pdev->vpci ) - return; - - spin_lock(&pdev->vpci->lock); -@@ -326,7 +326,7 @@ uint32_t vpci_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int size) - - /* Find the PCI dev matching the address. */ - pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn); -- if ( !pdev ) -+ if ( !pdev || !pdev->vpci ) - return vpci_read_hw(sbdf, reg, size); - - spin_lock(&pdev->vpci->lock); -@@ -436,7 +436,7 @@ void vpci_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int size, - * Passthrough everything that's not trapped. 
- */ - pdev = pci_get_pdev_by_domain(d, sbdf.seg, sbdf.bus, sbdf.devfn); -- if ( !pdev ) -+ if ( !pdev || !pdev->vpci ) - { - vpci_write_hw(sbdf, reg, size, data); - return; --- -2.37.4 - diff --git a/0036-x86-shadow-account-for-log-dirty-mode-when-pre-alloc.patch b/0036-x86-shadow-account-for-log-dirty-mode-when-pre-alloc.patch new file mode 100644 index 0000000..38629a4 --- /dev/null +++ b/0036-x86-shadow-account-for-log-dirty-mode-when-pre-alloc.patch @@ -0,0 +1,92 @@ +From b0d6684ee58f7252940f5a62e4b85bdc56307eef Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 11:59:44 +0000 +Subject: [PATCH 36/61] x86/shadow: account for log-dirty mode when + pre-allocating + +Pre-allocation is intended to ensure that in the course of constructing +or updating shadows there won't be any risk of just made shadows or +shadows being acted upon can disappear under our feet. The amount of +pages pre-allocated then, however, needs to account for all possible +subsequent allocations. While the use in sh_page_fault() accounts for +all shadows which may need making, so far it didn't account for +allocations coming from log-dirty tracking (which piggybacks onto the +P2M allocation functions). + +Since shadow_prealloc() takes a count of shadows (or other data +structures) rather than a count of pages, putting the adjustment at the +call site of this function won't work very well: We simply can't express +the correct count that way in all cases. Instead take care of this in +the function itself, by "snooping" for L1 type requests. (While not +applicable right now, future new request sites of L1 tables would then +also be covered right away.) + +It is relevant to note here that pre-allocations like the one done from +shadow_alloc_p2m_page() are benign when they fall in the "scope" of an +earlier pre-alloc which already included that count: The inner call will +simply find enough pages available then; it'll bail right away. 
+ +This is CVE-2022-42332 / XSA-427. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Tim Deegan <tim@xen.org> +(cherry picked from commit 91767a71061035ae42be93de495cd976f863a41a) +--- + xen/arch/x86/mm/paging.c | 1 + + xen/arch/x86/mm/shadow/common.c | 12 +++++++++++- + xen/include/asm-x86/paging.h | 4 ++++ + 3 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c +index 97ac9ccf59..9fb66e65cd 100644 +--- a/xen/arch/x86/mm/paging.c ++++ b/xen/arch/x86/mm/paging.c +@@ -280,6 +280,7 @@ void paging_mark_pfn_dirty(struct domain *d, pfn_t pfn) + if ( unlikely(!VALID_M2P(pfn_x(pfn))) ) + return; + ++ BUILD_BUG_ON(paging_logdirty_levels() != 4); + i1 = L1_LOGDIRTY_IDX(pfn); + i2 = L2_LOGDIRTY_IDX(pfn); + i3 = L3_LOGDIRTY_IDX(pfn); +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 1de0139742..c14a269935 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -1015,7 +1015,17 @@ bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) + if ( unlikely(d->is_dying) ) + return false; + +- ret = _shadow_prealloc(d, shadow_size(type) * count); ++ count *= shadow_size(type); ++ /* ++ * Log-dirty handling may result in allocations when populating its ++ * tracking structures. Tie this to the caller requesting space for L1 ++ * shadows. 
++ */ ++ if ( paging_mode_log_dirty(d) && ++ ((SHF_L1_ANY | SHF_FL1_ANY) & (1u << type)) ) ++ count += paging_logdirty_levels(); ++ ++ ret = _shadow_prealloc(d, count); + if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) + /* + * Failing to allocate memory required for shadow usage can only result in +diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h +index 27890791d8..c6b429c691 100644 +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -192,6 +192,10 @@ int paging_mfn_is_dirty(struct domain *d, mfn_t gmfn); + #define L4_LOGDIRTY_IDX(pfn) ((pfn_x(pfn) >> (PAGE_SHIFT + 3 + PAGETABLE_ORDER * 2)) & \ + (LOGDIRTY_NODE_ENTRIES-1)) + ++#define paging_logdirty_levels() \ ++ (DIV_ROUND_UP(PADDR_BITS - PAGE_SHIFT - (PAGE_SHIFT + 3), \ ++ PAGE_SHIFT - ilog2(sizeof(mfn_t))) + 1) ++ + #ifdef CONFIG_HVM + /* VRAM dirty tracking support */ + struct sh_dirty_vram { +-- +2.40.0 + diff --git a/0037-vpci-msix-remove-from-table-list-on-detach.patch b/0037-vpci-msix-remove-from-table-list-on-detach.patch deleted file mode 100644 index 2bae0a2..0000000 --- a/0037-vpci-msix-remove-from-table-list-on-detach.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 8f3f8f20de5cea704671d4ca83f2dceb93ab98d8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Mon, 31 Oct 2022 13:25:40 +0100 -Subject: [PATCH 37/87] vpci/msix: remove from table list on detach -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Teardown of MSIX vPCI related data doesn't currently remove the MSIX -device data from the list of MSIX tables handled by the domain, -leading to a use-after-free of the data in the msix structure. - -Remove the structure from the list before freeing in order to solve -it. 
- -Reported-by: Jan Beulich <jbeulich@suse.com> -Fixes: d6281be9d0 ('vpci/msix: add MSI-X handlers') -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: c14aea137eab29eb9c30bfad745a00c65ad21066 -master date: 2022-10-26 14:56:58 +0200 ---- - xen/drivers/vpci/vpci.c | 8 ++++++-- - 1 file changed, 6 insertions(+), 2 deletions(-) - -diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c -index 53d78d53911d..b9339f8f3e43 100644 ---- a/xen/drivers/vpci/vpci.c -+++ b/xen/drivers/vpci/vpci.c -@@ -51,8 +51,12 @@ void vpci_remove_device(struct pci_dev *pdev) - xfree(r); - } - spin_unlock(&pdev->vpci->lock); -- if ( pdev->vpci->msix && pdev->vpci->msix->pba ) -- iounmap(pdev->vpci->msix->pba); -+ if ( pdev->vpci->msix ) -+ { -+ list_del(&pdev->vpci->msix->next); -+ if ( pdev->vpci->msix->pba ) -+ iounmap(pdev->vpci->msix->pba); -+ } - xfree(pdev->vpci->msix); - xfree(pdev->vpci->msi); - xfree(pdev->vpci); --- -2.37.4 - diff --git a/0037-x86-HVM-bound-number-of-pinned-cache-attribute-regio.patch b/0037-x86-HVM-bound-number-of-pinned-cache-attribute-regio.patch new file mode 100644 index 0000000..6730b2d --- /dev/null +++ b/0037-x86-HVM-bound-number-of-pinned-cache-attribute-regio.patch @@ -0,0 +1,50 @@ +From 2fe1517a00e088f6b1f1aff7d4ea1b477b288987 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 12:01:01 +0000 +Subject: [PATCH 37/61] x86/HVM: bound number of pinned cache attribute regions + +This is exposed via DMOP, i.e. to potentially not fully privileged +device models. With that we may not permit registration of an (almost) +unbounded amount of such regions. + +This is CVE-2022-42333 / part of XSA-428. 
+ +Fixes: 642123c5123f ("x86/hvm: provide XEN_DMOP_pin_memory_cacheattr") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +(cherry picked from commit a5e768640f786b681063f4e08af45d0c4e91debf) +--- + xen/arch/x86/hvm/mtrr.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c +index 4a9f3177ed..98e55bbdbd 100644 +--- a/xen/arch/x86/hvm/mtrr.c ++++ b/xen/arch/x86/hvm/mtrr.c +@@ -595,6 +595,7 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + uint64_t gfn_end, uint32_t type) + { + struct hvm_mem_pinned_cacheattr_range *range; ++ unsigned int nr = 0; + int rc = 1; + + if ( !is_hvm_domain(d) ) +@@ -666,11 +667,15 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + rc = -EBUSY; + break; + } ++ ++nr; + } + rcu_read_unlock(&pinned_cacheattr_rcu_lock); + if ( rc <= 0 ) + return rc; + ++ if ( nr >= 64 /* The limit is arbitrary. */ ) ++ return -ENOSPC; ++ + range = xzalloc(struct hvm_mem_pinned_cacheattr_range); + if ( range == NULL ) + return -ENOMEM; +-- +2.40.0 + diff --git a/0038-x86-HVM-serialize-pinned-cache-attribute-list-manipu.patch b/0038-x86-HVM-serialize-pinned-cache-attribute-list-manipu.patch new file mode 100644 index 0000000..ca8528f --- /dev/null +++ b/0038-x86-HVM-serialize-pinned-cache-attribute-list-manipu.patch @@ -0,0 +1,126 @@ +From 564de020d29fbc4efd20ef8052051e86b2465a1a Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 12:01:01 +0000 +Subject: [PATCH 38/61] x86/HVM: serialize pinned cache attribute list + manipulation + +While the RCU variants of list insertion and removal allow lockless list +traversal (with RCU just read-locked), insertions and removals still +need serializing amongst themselves. To keep things simple, use the +domain lock for this purpose. + +This is CVE-2022-42334 / part of XSA-428. 
+ +Fixes: 642123c5123f ("x86/hvm: provide XEN_DMOP_pin_memory_cacheattr") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 829ec245cf66560e3b50d140ccb3168e7fb7c945) +--- + xen/arch/x86/hvm/mtrr.c | 51 +++++++++++++++++++++++++---------------- + 1 file changed, 31 insertions(+), 20 deletions(-) + +diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c +index 98e55bbdbd..9b3b33012b 100644 +--- a/xen/arch/x86/hvm/mtrr.c ++++ b/xen/arch/x86/hvm/mtrr.c +@@ -594,7 +594,7 @@ static void free_pinned_cacheattr_entry(struct rcu_head *rcu) + int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + uint64_t gfn_end, uint32_t type) + { +- struct hvm_mem_pinned_cacheattr_range *range; ++ struct hvm_mem_pinned_cacheattr_range *range, *newr; + unsigned int nr = 0; + int rc = 1; + +@@ -608,14 +608,15 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + { + case XEN_DOMCTL_DELETE_MEM_CACHEATTR: + /* Remove the requested range. 
*/ +- rcu_read_lock(&pinned_cacheattr_rcu_lock); +- list_for_each_entry_rcu ( range, +- &d->arch.hvm.pinned_cacheattr_ranges, +- list ) ++ domain_lock(d); ++ list_for_each_entry ( range, ++ &d->arch.hvm.pinned_cacheattr_ranges, ++ list ) + if ( range->start == gfn_start && range->end == gfn_end ) + { +- rcu_read_unlock(&pinned_cacheattr_rcu_lock); + list_del_rcu(&range->list); ++ domain_unlock(d); ++ + type = range->type; + call_rcu(&range->rcu, free_pinned_cacheattr_entry); + p2m_memory_type_changed(d); +@@ -636,7 +637,7 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + } + return 0; + } +- rcu_read_unlock(&pinned_cacheattr_rcu_lock); ++ domain_unlock(d); + return -ENOENT; + + case PAT_TYPE_UC_MINUS: +@@ -651,7 +652,10 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + return -EINVAL; + } + +- rcu_read_lock(&pinned_cacheattr_rcu_lock); ++ newr = xzalloc(struct hvm_mem_pinned_cacheattr_range); ++ ++ domain_lock(d); ++ + list_for_each_entry_rcu ( range, + &d->arch.hvm.pinned_cacheattr_ranges, + list ) +@@ -669,27 +673,34 @@ int hvm_set_mem_pinned_cacheattr(struct domain *d, uint64_t gfn_start, + } + ++nr; + } +- rcu_read_unlock(&pinned_cacheattr_rcu_lock); ++ + if ( rc <= 0 ) +- return rc; ++ /* nothing */; ++ else if ( nr >= 64 /* The limit is arbitrary. */ ) ++ rc = -ENOSPC; ++ else if ( !newr ) ++ rc = -ENOMEM; ++ else ++ { ++ newr->start = gfn_start; ++ newr->end = gfn_end; ++ newr->type = type; + +- if ( nr >= 64 /* The limit is arbitrary. 
*/ ) +- return -ENOSPC; ++ list_add_rcu(&newr->list, &d->arch.hvm.pinned_cacheattr_ranges); + +- range = xzalloc(struct hvm_mem_pinned_cacheattr_range); +- if ( range == NULL ) +- return -ENOMEM; ++ newr = NULL; ++ rc = 0; ++ } ++ ++ domain_unlock(d); + +- range->start = gfn_start; +- range->end = gfn_end; +- range->type = type; ++ xfree(newr); + +- list_add_rcu(&range->list, &d->arch.hvm.pinned_cacheattr_ranges); + p2m_memory_type_changed(d); + if ( type != PAT_TYPE_WRBACK ) + flush_all(FLUSH_CACHE); + +- return 0; ++ return rc; + } + + static int hvm_save_mtrr_msr(struct vcpu *v, hvm_domain_context_t *h) +-- +2.40.0 + diff --git a/0038-x86-also-zap-secondary-time-area-handles-during-soft.patch b/0038-x86-also-zap-secondary-time-area-handles-during-soft.patch deleted file mode 100644 index 286661a..0000000 --- a/0038-x86-also-zap-secondary-time-area-handles-during-soft.patch +++ /dev/null @@ -1,49 +0,0 @@ -From aac108509055e5f5ff293e1fb44614f96a0996c6 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:26:08 +0100 -Subject: [PATCH 38/87] x86: also zap secondary time area handles during soft - reset -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Just like domain_soft_reset() properly zaps runstate area handles, the -secondary time area ones also need discarding to prevent guest memory -corruption once the guest is re-started. 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: b80d4f8d2ea6418e32fb4f20d1304ace6d6566e3 -master date: 2022-10-27 11:49:09 +0200 ---- - xen/arch/x86/domain.c | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index a4356893bdbc..3fab2364be8d 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -929,6 +929,7 @@ int arch_domain_soft_reset(struct domain *d) - struct page_info *page = virt_to_page(d->shared_info), *new_page; - int ret = 0; - struct domain *owner; -+ struct vcpu *v; - mfn_t mfn; - gfn_t gfn; - p2m_type_t p2mt; -@@ -1008,7 +1009,12 @@ int arch_domain_soft_reset(struct domain *d) - "Failed to add a page to replace %pd's shared_info frame %"PRI_gfn"\n", - d, gfn_x(gfn)); - free_domheap_page(new_page); -+ goto exit_put_gfn; - } -+ -+ for_each_vcpu ( d, v ) -+ set_xen_guest_handle(v->arch.time_info_guest, NULL); -+ - exit_put_gfn: - put_gfn(d, gfn_x(gfn)); - exit_put_page: --- -2.37.4 - diff --git a/0039-common-map_vcpu_info-wants-to-unshare-the-underlying.patch b/0039-common-map_vcpu_info-wants-to-unshare-the-underlying.patch deleted file mode 100644 index cea8bb5..0000000 --- a/0039-common-map_vcpu_info-wants-to-unshare-the-underlying.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 426a8346c01075ec5eba4aadefab03a96b6ece6a Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Mon, 31 Oct 2022 13:26:33 +0100 -Subject: [PATCH 39/87] common: map_vcpu_info() wants to unshare the underlying - page -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Not passing P2M_UNSHARE to get_page_from_gfn() means there won't even be -an attempt to unshare the referenced page, without any indication to the -caller (e.g. -EAGAIN). 
Note that guests have no direct control over -which of their pages are shared (or paged out), and hence they have no -way to make sure all on their own that the subsequent obtaining of a -writable type reference can actually succeed. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -Acked-by: Julien Grall <jgrall@amazon.com> -master commit: 48980cf24d5cf41fd644600f99c753419505e735 -master date: 2022-10-28 11:38:32 +0200 ---- - xen/common/domain.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/common/domain.c b/xen/common/domain.c -index 56d47dd66478..e3afcacb6cae 100644 ---- a/xen/common/domain.c -+++ b/xen/common/domain.c -@@ -1471,7 +1471,7 @@ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset) - if ( (v != current) && !(v->pause_flags & VPF_down) ) - return -EINVAL; - -- page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC); -+ page = get_page_from_gfn(d, gfn, NULL, P2M_UNSHARE); - if ( !page ) - return -EINVAL; - --- -2.37.4 - diff --git a/0039-x86-spec-ctrl-Defer-CR4_PV32_RESTORE-on-the-cstar_en.patch b/0039-x86-spec-ctrl-Defer-CR4_PV32_RESTORE-on-the-cstar_en.patch new file mode 100644 index 0000000..74bcf67 --- /dev/null +++ b/0039-x86-spec-ctrl-Defer-CR4_PV32_RESTORE-on-the-cstar_en.patch @@ -0,0 +1,56 @@ +From 3c924fe46b455834b5c04268db6b528b549668d1 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 10 Feb 2023 21:11:14 +0000 +Subject: [PATCH 39/61] x86/spec-ctrl: Defer CR4_PV32_RESTORE on the + cstar_enter path + +As stated (correctly) by the comment next to SPEC_CTRL_ENTRY_FROM_PV, between +the two hunks visible in the patch, RET's are not safe prior to this point. + +CR4_PV32_RESTORE hides a CALL/RET pair in certain configurations (PV32 +compiled in, SMEP or SMAP active), and the RET can be attacked with one of +several known speculative issues. 
+ +Furthermore, CR4_PV32_RESTORE also hides a reference to the cr4_pv32_mask +global variable, which is not safe when XPTI is active before restoring Xen's +full pagetables. + +This crash has gone unnoticed because it is only AMD CPUs which permit the +SYSCALL instruction in compatibility mode, and these are not vulnerable to +Meltdown so don't activate XPTI by default. + +This is XSA-429 / CVE-2022-42331 + +Fixes: 5e7962901131 ("x86/entry: Organise the use of MSR_SPEC_CTRL at each entry/exit point") +Fixes: 5784de3e2067 ("x86: Meltdown band-aid against malicious 64-bit PV guests") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +(cherry picked from commit df5b055b12116d9e63ced59ae5389e69a2a3de48) +--- + xen/arch/x86/x86_64/entry.S | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index fba8ae498f..db2ea7871e 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -288,7 +288,6 @@ ENTRY(cstar_enter) + ALTERNATIVE "", "setssbsy", X86_FEATURE_XEN_SHSTK + #endif + push %rax /* Guest %rsp */ +- CR4_PV32_RESTORE + movq 8(%rsp), %rax /* Restore guest %rax. */ + movq $FLAT_USER_SS32, 8(%rsp) /* Assume a 64bit domain. Compat handled lower. 
*/ + pushq %r11 +@@ -312,6 +311,8 @@ ENTRY(cstar_enter) + .Lcstar_cr3_okay: + sti + ++ CR4_PV32_RESTORE ++ + movq STACK_CPUINFO_FIELD(current_vcpu)(%rbx), %rbx + + #ifdef CONFIG_PV32 +-- +2.40.0 + diff --git a/0040-tools-python-change-s-size-type-for-Python-3.10.patch b/0040-tools-python-change-s-size-type-for-Python-3.10.patch new file mode 100644 index 0000000..979fd6f --- /dev/null +++ b/0040-tools-python-change-s-size-type-for-Python-3.10.patch @@ -0,0 +1,72 @@ +From 0cbffc6099db7fd01041910a98b99ccad50af11b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + <marmarek@invisiblethingslab.com> +Date: Tue, 21 Mar 2023 13:49:28 +0100 +Subject: [PATCH 40/61] tools/python: change 's#' size type for Python >= 3.10 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Python < 3.10 by default uses 'int' type for data+size string types +(s#), unless PY_SSIZE_T_CLEAN is defined - in which case it uses +Py_ssize_t. The former behavior was removed in Python 3.10 and now it's +required to define PY_SSIZE_T_CLEAN before including Python.h, and using +Py_ssize_t for the length argument. The PY_SSIZE_T_CLEAN behavior is +supported since Python 2.5. + +Adjust bindings accordingly. 
+ +Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> +Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> +master commit: 897257ba49d0a6ddcf084960fd792ccce9c40f94 +master date: 2023-02-06 08:50:13 +0100 +--- + tools/python/xen/lowlevel/xc/xc.c | 3 ++- + tools/python/xen/lowlevel/xs/xs.c | 3 ++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c +index fd00861032..cfb2734a99 100644 +--- a/tools/python/xen/lowlevel/xc/xc.c ++++ b/tools/python/xen/lowlevel/xc/xc.c +@@ -4,6 +4,7 @@ + * Copyright (c) 2003-2004, K A Fraser (University of Cambridge) + */ + ++#define PY_SSIZE_T_CLEAN + #include <Python.h> + #define XC_WANT_COMPAT_MAP_FOREIGN_API + #include <xenctrl.h> +@@ -1774,7 +1775,7 @@ static PyObject *pyflask_load(PyObject *self, PyObject *args, PyObject *kwds) + { + xc_interface *xc_handle; + char *policy; +- uint32_t len; ++ Py_ssize_t len; + int ret; + + static char *kwd_list[] = { "policy", NULL }; +diff --git a/tools/python/xen/lowlevel/xs/xs.c b/tools/python/xen/lowlevel/xs/xs.c +index 0dad7fa5f2..3ba5a8b893 100644 +--- a/tools/python/xen/lowlevel/xs/xs.c ++++ b/tools/python/xen/lowlevel/xs/xs.c +@@ -18,6 +18,7 @@ + * Copyright (C) 2005 XenSource Ltd. 
+ */ + ++#define PY_SSIZE_T_CLEAN + #include <Python.h> + + #include <stdbool.h> +@@ -141,7 +142,7 @@ static PyObject *xspy_write(XsHandle *self, PyObject *args) + char *thstr; + char *path; + char *data; +- int data_n; ++ Py_ssize_t data_n; + bool result; + + if (!xh) +-- +2.40.0 + diff --git a/0040-x86-pv-shim-correctly-ignore-empty-onlining-requests.patch b/0040-x86-pv-shim-correctly-ignore-empty-onlining-requests.patch deleted file mode 100644 index d242cb2..0000000 --- a/0040-x86-pv-shim-correctly-ignore-empty-onlining-requests.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 08f6c88405a4406cac5b90e8d9873258dc445006 Mon Sep 17 00:00:00 2001 -From: Igor Druzhinin <igor.druzhinin@citrix.com> -Date: Mon, 31 Oct 2022 13:26:59 +0100 -Subject: [PATCH 40/87] x86/pv-shim: correctly ignore empty onlining requests -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Mem-op requests may have zero extents. Such requests need treating as -no-ops. pv_shim_online_memory(), however, would have tried to take 2³²-1 -order-sized pages from its balloon list (to then populate them), -typically ending when the entire set of ballooned pages of this order -was consumed. - -Note that pv_shim_offline_memory() does not have such an issue. 
- -Fixes: b2245acc60c3 ("xen/pvshim: memory hotplug") -Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 9272225ca72801fd9fa5b268a2d1c5adebd19cd9 -master date: 2022-10-28 15:47:59 +0200 ---- - xen/arch/x86/pv/shim.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/xen/arch/x86/pv/shim.c b/xen/arch/x86/pv/shim.c -index d9704121a739..4146ee3f9ce8 100644 ---- a/xen/arch/x86/pv/shim.c -+++ b/xen/arch/x86/pv/shim.c -@@ -944,6 +944,9 @@ void pv_shim_online_memory(unsigned int nr, unsigned int order) - struct page_info *page, *tmp; - PAGE_LIST_HEAD(list); - -+ if ( !nr ) -+ return; -+ - spin_lock(&balloon_lock); - page_list_for_each_safe ( page, tmp, &balloon ) - { --- -2.37.4 - diff --git a/0041-tools-xenmon-Fix-xenmon.py-for-with-python3.x.patch b/0041-tools-xenmon-Fix-xenmon.py-for-with-python3.x.patch new file mode 100644 index 0000000..ff97af6 --- /dev/null +++ b/0041-tools-xenmon-Fix-xenmon.py-for-with-python3.x.patch @@ -0,0 +1,54 @@ +From 5ce8d2aef85f590e4fb42d18784512203069d0c0 Mon Sep 17 00:00:00 2001 +From: Bernhard Kaindl <bernhard.kaindl@citrix.com> +Date: Tue, 21 Mar 2023 13:49:47 +0100 +Subject: [PATCH 41/61] tools/xenmon: Fix xenmon.py for with python3.x + +Fixes for Py3: +* class Delayed(): file not defined; also an error for pylint -E. Inherit + object instead for Py2 compatibility. Fix DomainInfo() too. 
+* Inconsistent use of tabs and spaces for indentation (in one block) + +Signed-off-by: Bernhard Kaindl <bernhard.kaindl@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 3a59443c1d5ae0677a792c660ccd3796ce036732 +master date: 2023-02-06 10:22:12 +0000 +--- + tools/xenmon/xenmon.py | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/xenmon/xenmon.py b/tools/xenmon/xenmon.py +index 175eacd2cb..977ada6887 100644 +--- a/tools/xenmon/xenmon.py ++++ b/tools/xenmon/xenmon.py +@@ -117,7 +117,7 @@ def setup_cmdline_parser(): + return parser + + # encapsulate information about a domain +-class DomainInfo: ++class DomainInfo(object): + def __init__(self): + self.allocated_sum = 0 + self.gotten_sum = 0 +@@ -533,7 +533,7 @@ def show_livestats(cpu): + # simple functions to allow initialization of log files without actually + # physically creating files that are never used; only on the first real + # write does the file get created +-class Delayed(file): ++class Delayed(object): + def __init__(self, filename, mode): + self.filename = filename + self.saved_mode = mode +@@ -677,8 +677,8 @@ def main(): + + if os.uname()[0] == "SunOS": + xenbaked_cmd = "/usr/lib/xenbaked" +- stop_cmd = "/usr/bin/pkill -INT -z global xenbaked" +- kill_cmd = "/usr/bin/pkill -KILL -z global xenbaked" ++ stop_cmd = "/usr/bin/pkill -INT -z global xenbaked" ++ kill_cmd = "/usr/bin/pkill -KILL -z global xenbaked" + else: + # assumes that xenbaked is in your path + xenbaked_cmd = "xenbaked" +-- +2.40.0 + diff --git a/0041-x86-pv-shim-correct-ballooning-up-for-compat-guests.patch b/0041-x86-pv-shim-correct-ballooning-up-for-compat-guests.patch deleted file mode 100644 index 5c77bbf..0000000 --- a/0041-x86-pv-shim-correct-ballooning-up-for-compat-guests.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 2f75e3654f00a62bd1f446a7424ccd56750a2e15 Mon Sep 17 00:00:00 2001 -From: Igor Druzhinin <igor.druzhinin@citrix.com> -Date: Mon, 31 Oct 2022 13:28:15 +0100 
-Subject: [PATCH 41/87] x86/pv-shim: correct ballooning up for compat guests - -The compat layer for multi-extent memory ops may need to split incoming -requests. Since the guest handles in the interface structures may not be -altered, it does so by leveraging do_memory_op()'s continuation -handling: It hands on non-initial requests with a non-zero start extent, -with the (native) handle suitably adjusted down. As a result -do_memory_op() sees only the first of potentially several requests with -start extent being zero. It's only that case when the function would -issue a call to pv_shim_online_memory(), yet the range then covers only -the first sub-range that results from the split. - -Address that breakage by making a complementary call to -pv_shim_online_memory() in compat layer. - -Fixes: b2245acc60c3 ("xen/pvshim: memory hotplug") -Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: a0bfdd201ea12aa5679bb8944d63a4e0d3c23160 -master date: 2022-10-28 15:48:50 +0200 ---- - xen/common/compat/memory.c | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/xen/common/compat/memory.c b/xen/common/compat/memory.c -index c43fa97cf15f..a0e0562a4033 100644 ---- a/xen/common/compat/memory.c -+++ b/xen/common/compat/memory.c -@@ -7,6 +7,7 @@ EMIT_FILE; - #include <xen/event.h> - #include <xen/mem_access.h> - #include <asm/current.h> -+#include <asm/guest.h> - #include <compat/memory.h> - - #define xen_domid_t domid_t -@@ -146,7 +147,10 @@ int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) compat) - nat.rsrv->nr_extents = end_extent; - ++split; - } -- -+ /* Avoid calling pv_shim_online_memory() when in a continuation. 
*/ -+ if ( pv_shim && op != XENMEM_decrease_reservation && !start_extent ) -+ pv_shim_online_memory(cmp.rsrv.nr_extents - nat.rsrv->nr_extents, -+ cmp.rsrv.extent_order); - break; - - case XENMEM_exchange: --- -2.37.4 - diff --git a/0042-core-parking-fix-build-with-gcc12-and-NR_CPUS-1.patch b/0042-core-parking-fix-build-with-gcc12-and-NR_CPUS-1.patch new file mode 100644 index 0000000..c425c43 --- /dev/null +++ b/0042-core-parking-fix-build-with-gcc12-and-NR_CPUS-1.patch @@ -0,0 +1,95 @@ +From 4a6bedefe589dab12182d6b974de8ea3b2fcc681 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 13:50:18 +0100 +Subject: [PATCH 42/61] core-parking: fix build with gcc12 and NR_CPUS=1 + +Gcc12 takes issue with core_parking_remove()'s + + for ( ; i < cur_idle_nums; ++i ) + core_parking_cpunum[i] = core_parking_cpunum[i + 1]; + +complaining that the right hand side array access is past the bounds of +1. Clearly the compiler can't know that cur_idle_nums can only ever be +zero in this case (as the sole CPU cannot be parked). + +Arrange for core_parking.c's contents to not be needed altogether, and +then disable its building when NR_CPUS == 1. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 4b0422f70feb4b1cd04598ffde805fc224f3812e +master date: 2023-03-13 15:15:42 +0100 +--- + xen/arch/x86/Kconfig | 2 +- + xen/arch/x86/platform_hypercall.c | 11 ++++++++--- + xen/arch/x86/sysctl.c | 3 +++ + xen/common/Kconfig | 1 + + 4 files changed, 13 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig +index 3c14096c80..8e2b504923 100644 +--- a/xen/arch/x86/Kconfig ++++ b/xen/arch/x86/Kconfig +@@ -8,7 +8,7 @@ config X86 + select ACPI_LEGACY_TABLES_LOOKUP + select ALTERNATIVE_CALL + select ARCH_SUPPORTS_INT128 +- select CORE_PARKING ++ imply CORE_PARKING + select HAS_ALTERNATIVE + select HAS_COMPAT + select HAS_CPUFREQ +diff --git a/xen/arch/x86/platform_hypercall.c b/xen/arch/x86/platform_hypercall.c +index bf4090c942..c35e5669a4 100644 +--- a/xen/arch/x86/platform_hypercall.c ++++ b/xen/arch/x86/platform_hypercall.c +@@ -725,12 +725,17 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op) + case XEN_CORE_PARKING_SET: + idle_nums = min_t(uint32_t, + op->u.core_parking.idle_nums, num_present_cpus() - 1); +- ret = continue_hypercall_on_cpu( +- 0, core_parking_helper, (void *)(unsigned long)idle_nums); ++ if ( CONFIG_NR_CPUS > 1 ) ++ ret = continue_hypercall_on_cpu( ++ 0, core_parking_helper, ++ (void *)(unsigned long)idle_nums); ++ else if ( idle_nums ) ++ ret = -EINVAL; + break; + + case XEN_CORE_PARKING_GET: +- op->u.core_parking.idle_nums = get_cur_idle_nums(); ++ op->u.core_parking.idle_nums = CONFIG_NR_CPUS > 1 ++ ? get_cur_idle_nums() : 0; + ret = __copy_field_to_guest(u_xenpf_op, op, u.core_parking) ? 
+ -EFAULT : 0; + break; +diff --git a/xen/arch/x86/sysctl.c b/xen/arch/x86/sysctl.c +index aff52a13f3..ff843eaee2 100644 +--- a/xen/arch/x86/sysctl.c ++++ b/xen/arch/x86/sysctl.c +@@ -179,6 +179,9 @@ long arch_do_sysctl( + ret = -EBUSY; + break; + } ++ if ( CONFIG_NR_CPUS <= 1 ) ++ /* Mimic behavior of smt_up_down_helper(). */ ++ return 0; + plug = op == XEN_SYSCTL_CPU_HOTPLUG_SMT_ENABLE; + fn = smt_up_down_helper; + hcpu = _p(plug); +diff --git a/xen/common/Kconfig b/xen/common/Kconfig +index 6443943889..c9f4b7f492 100644 +--- a/xen/common/Kconfig ++++ b/xen/common/Kconfig +@@ -10,6 +10,7 @@ config COMPAT + + config CORE_PARKING + bool ++ depends on NR_CPUS > 1 + + config GRANT_TABLE + bool "Grant table support" if EXPERT +-- +2.40.0 + diff --git a/0042-x86-pv-shim-correct-ballooning-down-for-compat-guest.patch b/0042-x86-pv-shim-correct-ballooning-down-for-compat-guest.patch deleted file mode 100644 index dd044e4..0000000 --- a/0042-x86-pv-shim-correct-ballooning-down-for-compat-guest.patch +++ /dev/null @@ -1,72 +0,0 @@ -From c229b16ba3eb5579a9a5d470ab16dd9ad55e57d6 Mon Sep 17 00:00:00 2001 -From: Igor Druzhinin <igor.druzhinin@citrix.com> -Date: Mon, 31 Oct 2022 13:28:46 +0100 -Subject: [PATCH 42/87] x86/pv-shim: correct ballooning down for compat guests - -The compat layer for multi-extent memory ops may need to split incoming -requests. Since the guest handles in the interface structures may not be -altered, it does so by leveraging do_memory_op()'s continuation -handling: It hands on non-initial requests with a non-zero start extent, -with the (native) handle suitably adjusted down. As a result -do_memory_op() sees only the first of potentially several requests with -start extent being zero. In order to be usable as overall result, the -function accumulates args.nr_done, i.e. it initialized the field with -the start extent. Therefore non-initial requests resulting from the -split would pass too large a number into pv_shim_offline_memory(). 
- -Address that breakage by always calling pv_shim_offline_memory() -regardless of current hypercall preemption status, with a suitably -adjusted first argument. Note that this is correct also for the native -guest case: We now simply "commit" what was completed right away, rather -than at the end of a series of preemption/re-start cycles. In fact this -improves overall preemption behavior: There's no longer a potentially -big chunk of work done non-preemptively at the end of the last -"iteration". - -Fixes: b2245acc60c3 ("xen/pvshim: memory hotplug") -Signed-off-by: Igor Druzhinin <igor.druzhinin@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 1d7fbc535d1d37bdc2cc53ede360b0f6651f7de1 -master date: 2022-10-28 15:49:33 +0200 ---- - xen/common/memory.c | 19 +++++++------------ - 1 file changed, 7 insertions(+), 12 deletions(-) - -diff --git a/xen/common/memory.c b/xen/common/memory.c -index 064de4ad8d66..76f8858cc379 100644 ---- a/xen/common/memory.c -+++ b/xen/common/memory.c -@@ -1420,22 +1420,17 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) - - rc = args.nr_done; - -- if ( args.preempted ) -- return hypercall_create_continuation( -- __HYPERVISOR_memory_op, "lh", -- op | (rc << MEMOP_EXTENT_SHIFT), arg); -- - #ifdef CONFIG_X86 - if ( pv_shim && op == XENMEM_decrease_reservation ) -- /* -- * Only call pv_shim_offline_memory when the hypercall has -- * finished. Note that nr_done is used to cope in case the -- * hypercall has failed and only part of the extents where -- * processed. 
-- */ -- pv_shim_offline_memory(args.nr_done, args.extent_order); -+ pv_shim_offline_memory(args.nr_done - start_extent, -+ args.extent_order); - #endif - -+ if ( args.preempted ) -+ return hypercall_create_continuation( -+ __HYPERVISOR_memory_op, "lh", -+ op | (rc << MEMOP_EXTENT_SHIFT), arg); -+ - break; - - case XENMEM_exchange: --- -2.37.4 - diff --git a/0043-x86-altp2m-help-gcc13-to-avoid-it-emitting-a-warning.patch b/0043-x86-altp2m-help-gcc13-to-avoid-it-emitting-a-warning.patch new file mode 100644 index 0000000..0e040ad --- /dev/null +++ b/0043-x86-altp2m-help-gcc13-to-avoid-it-emitting-a-warning.patch @@ -0,0 +1,129 @@ +From cdde3171a2a932a6836b094c4387412e27414ec9 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 13:51:42 +0100 +Subject: [PATCH 43/61] x86/altp2m: help gcc13 to avoid it emitting a warning + +Switches of altp2m-s always expect a valid altp2m to be in place (and +indeed altp2m_vcpu_initialise() sets the active one to be at index 0). +The compiler, however, cannot know that, and hence it cannot eliminate +p2m_get_altp2m()'s case of returning (literal) NULL. If then the compiler +decides to special case that code path in the caller, the dereference in +instances of + + atomic_dec(&p2m_get_altp2m(v)->active_vcpus); + +can, to the code generator, appear to be NULL dereferences, leading to + +In function 'atomic_dec', + inlined from '...' at ...: +./arch/x86/include/asm/atomic.h:182:5: error: array subscript 0 is outside array bounds of 'int[0]' [-Werror=array-bounds=] + +Aid the compiler by adding a BUG_ON() checking the return value of the +problematic p2m_get_altp2m(). Since with the use of the local variable +the 2nd p2m_get_altp2m() each will look questionable at the first glance +(Why is the local variable not used here?), open-code the only relevant +piece of p2m_get_altp2m() there. 
+ +To avoid repeatedly doing these transformations, and also to limit how +"bad" the open-coding really is, convert the entire operation to an +inline helper, used by all three instances (and accepting the redundant +BUG_ON(idx >= MAX_ALTP2M) in two of the three cases). + +Reported-by: Charles Arnold <carnold@suse.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: be62b1fc2aa7375d553603fca07299da765a89fe +master date: 2023-03-13 15:16:21 +0100 +--- + xen/arch/x86/hvm/vmx/vmx.c | 8 +------- + xen/arch/x86/mm/p2m.c | 14 ++------------ + xen/include/asm-x86/p2m.h | 20 ++++++++++++++++++++ + 3 files changed, 23 insertions(+), 19 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 094141be9a..c8a839cd5e 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -4036,13 +4036,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) + } + } + +- if ( idx != vcpu_altp2m(v).p2midx ) +- { +- BUG_ON(idx >= MAX_ALTP2M); +- atomic_dec(&p2m_get_altp2m(v)->active_vcpus); +- vcpu_altp2m(v).p2midx = idx; +- atomic_inc(&p2m_get_altp2m(v)->active_vcpus); +- } ++ p2m_set_altp2m(v, idx); + } + + /* XXX: This looks ugly, but we need a mechanism to ensure +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index 8781df9dda..2d41446a69 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -2194,13 +2194,8 @@ bool_t p2m_switch_vcpu_altp2m_by_id(struct vcpu *v, unsigned int idx) + + if ( d->arch.altp2m_eptp[idx] != mfn_x(INVALID_MFN) ) + { +- if ( idx != vcpu_altp2m(v).p2midx ) +- { +- atomic_dec(&p2m_get_altp2m(v)->active_vcpus); +- vcpu_altp2m(v).p2midx = idx; +- atomic_inc(&p2m_get_altp2m(v)->active_vcpus); ++ if ( p2m_set_altp2m(v, idx) ) + altp2m_vcpu_update_p2m(v); +- } + rc = 1; + } + +@@ -2471,13 +2466,8 @@ int p2m_switch_domain_altp2m_by_id(struct domain *d, unsigned int idx) + if ( d->arch.altp2m_visible_eptp[idx] != 
mfn_x(INVALID_MFN) ) + { + for_each_vcpu( d, v ) +- if ( idx != vcpu_altp2m(v).p2midx ) +- { +- atomic_dec(&p2m_get_altp2m(v)->active_vcpus); +- vcpu_altp2m(v).p2midx = idx; +- atomic_inc(&p2m_get_altp2m(v)->active_vcpus); ++ if ( p2m_set_altp2m(v, idx) ) + altp2m_vcpu_update_p2m(v); +- } + + rc = 0; + } +diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h +index 2db9ab0122..f92bb97394 100644 +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -841,6 +841,26 @@ static inline struct p2m_domain *p2m_get_altp2m(struct vcpu *v) + return v->domain->arch.altp2m_p2m[index]; + } + ++/* set current alternate p2m table */ ++static inline bool p2m_set_altp2m(struct vcpu *v, unsigned int idx) ++{ ++ struct p2m_domain *orig; ++ ++ BUG_ON(idx >= MAX_ALTP2M); ++ ++ if ( idx == vcpu_altp2m(v).p2midx ) ++ return false; ++ ++ orig = p2m_get_altp2m(v); ++ BUG_ON(!orig); ++ atomic_dec(&orig->active_vcpus); ++ ++ vcpu_altp2m(v).p2midx = idx; ++ atomic_inc(&v->domain->arch.altp2m_p2m[idx]->active_vcpus); ++ ++ return true; ++} ++ + /* Switch alternate p2m for a single vcpu */ + bool_t p2m_switch_vcpu_altp2m_by_id(struct vcpu *v, unsigned int idx); + +-- +2.40.0 + diff --git a/0043-x86-vmx-Revert-VMX-use-a-single-global-APIC-access-p.patch b/0043-x86-vmx-Revert-VMX-use-a-single-global-APIC-access-p.patch deleted file mode 100644 index 92b3bf1..0000000 --- a/0043-x86-vmx-Revert-VMX-use-a-single-global-APIC-access-p.patch +++ /dev/null @@ -1,259 +0,0 @@ -From 62e7fb702db4adaa9415ac87d95e0f461e32d9ca Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Wed, 24 Aug 2022 14:16:44 +0100 -Subject: [PATCH 43/87] x86/vmx: Revert "VMX: use a single, global APIC access - page" - -The claim "No accesses would ever go to this page." is false. 
A consequence -of how Intel's APIC Acceleration works, and Xen's choice to have per-domain -P2Ms (rather than per-vCPU P2Ms) means that the APIC page is fully read-write -to any vCPU which is not in xAPIC mode. - -This reverts commit 58850b9074d3e7affdf3bc94c84e417ecfa4d165. - -This is XSA-412 / CVE-2022-42327. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 3b5beaf49033cddf4b2cc4e4d391b966f4203471) ---- - xen/arch/x86/hvm/vmx/vmx.c | 59 ++++++++++++++++++++++-------- - xen/arch/x86/mm/shadow/set.c | 8 ---- - xen/arch/x86/mm/shadow/types.h | 7 ---- - xen/include/asm-x86/hvm/vmx/vmcs.h | 1 + - xen/include/asm-x86/mm.h | 20 +--------- - 5 files changed, 46 insertions(+), 49 deletions(-) - -diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c -index d429d76c18c9..3f4276531322 100644 ---- a/xen/arch/x86/hvm/vmx/vmx.c -+++ b/xen/arch/x86/hvm/vmx/vmx.c -@@ -66,7 +66,8 @@ boolean_param("force-ept", opt_force_ept); - static void vmx_ctxt_switch_from(struct vcpu *v); - static void vmx_ctxt_switch_to(struct vcpu *v); - --static int alloc_vlapic_mapping(void); -+static int vmx_alloc_vlapic_mapping(struct domain *d); -+static void vmx_free_vlapic_mapping(struct domain *d); - static void vmx_install_vlapic_mapping(struct vcpu *v); - static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr, - unsigned int flags); -@@ -77,8 +78,6 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content); - static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content); - static void vmx_invlpg(struct vcpu *v, unsigned long linear); - --static mfn_t __read_mostly apic_access_mfn = INVALID_MFN_INITIALIZER; -- - /* Values for domain's ->arch.hvm_domain.pi_ops.flags. 
*/ - #define PI_CSW_FROM (1u << 0) - #define PI_CSW_TO (1u << 1) -@@ -402,6 +401,7 @@ static int vmx_domain_initialise(struct domain *d) - .to = vmx_ctxt_switch_to, - .tail = vmx_do_resume, - }; -+ int rc; - - d->arch.ctxt_switch = &csw; - -@@ -411,15 +411,24 @@ static int vmx_domain_initialise(struct domain *d) - */ - d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp; - -+ if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) -+ return rc; -+ - return 0; - } - -+static void vmx_domain_relinquish_resources(struct domain *d) -+{ -+ vmx_free_vlapic_mapping(d); -+} -+ - static void domain_creation_finished(struct domain *d) - { - gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE); -+ mfn_t apic_access_mfn = d->arch.hvm.vmx.apic_access_mfn; - bool ipat; - -- if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, INVALID_MFN) ) -+ if ( mfn_eq(apic_access_mfn, _mfn(0)) ) - return; - - ASSERT(epte_get_entry_emt(d, gfn, apic_access_mfn, 0, &ipat, -@@ -2481,6 +2490,7 @@ static struct hvm_function_table __initdata vmx_function_table = { - .cpu_up_prepare = vmx_cpu_up_prepare, - .cpu_dead = vmx_cpu_dead, - .domain_initialise = vmx_domain_initialise, -+ .domain_relinquish_resources = vmx_domain_relinquish_resources, - .domain_creation_finished = domain_creation_finished, - .vcpu_initialise = vmx_vcpu_initialise, - .vcpu_destroy = vmx_vcpu_destroy, -@@ -2731,7 +2741,7 @@ const struct hvm_function_table * __init start_vmx(void) - { - set_in_cr4(X86_CR4_VMXE); - -- if ( vmx_vmcs_init() || alloc_vlapic_mapping() ) -+ if ( vmx_vmcs_init() ) - { - printk("VMX: failed to initialise.\n"); - return NULL; -@@ -3305,36 +3315,55 @@ gp_fault: - return X86EMUL_EXCEPTION; - } - --static int __init alloc_vlapic_mapping(void) -+static int vmx_alloc_vlapic_mapping(struct domain *d) - { - struct page_info *pg; - mfn_t mfn; - -- if ( !cpu_has_vmx_virtualize_apic_accesses ) -+ if ( !has_vlapic(d) || !cpu_has_vmx_virtualize_apic_accesses ) - return 0; - -- pg = alloc_domheap_page(NULL, 0); -+ pg = 
alloc_domheap_page(d, MEMF_no_refcount); - if ( !pg ) - return -ENOMEM; - -- /* -- * Signal to shadow code that this page cannot be refcounted. This also -- * makes epte_get_entry_emt() recognize this page as "special". -- */ -- page_suppress_refcounting(pg); -+ if ( !get_page_and_type(pg, d, PGT_writable_page) ) -+ { -+ /* -+ * The domain can't possibly know about this page yet, so failure -+ * here is a clear indication of something fishy going on. -+ */ -+ domain_crash(d); -+ return -ENODATA; -+ } - - mfn = page_to_mfn(pg); - clear_domain_page(mfn); -- apic_access_mfn = mfn; -+ d->arch.hvm.vmx.apic_access_mfn = mfn; - - return 0; - } - -+static void vmx_free_vlapic_mapping(struct domain *d) -+{ -+ mfn_t mfn = d->arch.hvm.vmx.apic_access_mfn; -+ -+ d->arch.hvm.vmx.apic_access_mfn = _mfn(0); -+ if ( !mfn_eq(mfn, _mfn(0)) ) -+ { -+ struct page_info *pg = mfn_to_page(mfn); -+ -+ put_page_alloc_ref(pg); -+ put_page_and_type(pg); -+ } -+} -+ - static void vmx_install_vlapic_mapping(struct vcpu *v) - { -+ mfn_t apic_access_mfn = v->domain->arch.hvm.vmx.apic_access_mfn; - paddr_t virt_page_ma, apic_page_ma; - -- if ( !has_vlapic(v->domain) || mfn_eq(apic_access_mfn, INVALID_MFN) ) -+ if ( mfn_eq(apic_access_mfn, _mfn(0)) ) - return; - - ASSERT(cpu_has_vmx_virtualize_apic_accesses); -diff --git a/xen/arch/x86/mm/shadow/set.c b/xen/arch/x86/mm/shadow/set.c -index 87e9c6eeb219..bd6c68b547c9 100644 ---- a/xen/arch/x86/mm/shadow/set.c -+++ b/xen/arch/x86/mm/shadow/set.c -@@ -101,14 +101,6 @@ shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d, p2m_type_t type) - owner = page_get_owner(pg); - } - -- /* -- * Check whether refcounting is suppressed on this page. For example, -- * VMX'es APIC access MFN is just a surrogate page. It doesn't actually -- * get accessed, and hence there's no need to refcount it. 
-- */ -- if ( pg && page_refcounting_suppressed(pg) ) -- return 0; -- - if ( owner == dom_io ) - owner = NULL; - -diff --git a/xen/arch/x86/mm/shadow/types.h b/xen/arch/x86/mm/shadow/types.h -index 6970e7d6ea4a..814a4018535a 100644 ---- a/xen/arch/x86/mm/shadow/types.h -+++ b/xen/arch/x86/mm/shadow/types.h -@@ -276,16 +276,9 @@ int shadow_set_l4e(struct domain *d, shadow_l4e_t *sl4e, - static void inline - shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) - { -- mfn_t mfn = shadow_l1e_get_mfn(sl1e); -- - if ( !shadow_mode_refcounts(d) ) - return; - -- if ( mfn_valid(mfn) && -- /* See the respective comment in shadow_get_page_from_l1e(). */ -- page_refcounting_suppressed(mfn_to_page(mfn)) ) -- return; -- - put_page_from_l1e(sl1e, d); - } - -diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h -index 03c9ccf627ab..8073af323b96 100644 ---- a/xen/include/asm-x86/hvm/vmx/vmcs.h -+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h -@@ -58,6 +58,7 @@ struct ept_data { - #define _VMX_DOMAIN_PML_ENABLED 0 - #define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED) - struct vmx_domain { -+ mfn_t apic_access_mfn; - /* VMX_DOMAIN_* */ - unsigned int status; - -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 7bdf9c2290d8..e1bcea57a8f5 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -83,7 +83,7 @@ - #define PGC_state_offlined PG_mask(2, 6) - #define PGC_state_free PG_mask(3, 6) - #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st) --/* Page is not reference counted (see below for caveats) */ -+/* Page is not reference counted */ - #define _PGC_extra PG_shift(7) - #define PGC_extra PG_mask(1, 7) - -@@ -375,24 +375,6 @@ void zap_ro_mpt(mfn_t mfn); - - bool is_iomem_page(mfn_t mfn); - --/* -- * Pages with no owner which may get passed to functions wanting to -- * refcount them can be marked PGC_extra to bypass this refcounting (which -- * would fail due to the lack 
of an owner). -- * -- * (For pages with owner PGC_extra has different meaning.) -- */ --static inline void page_suppress_refcounting(struct page_info *pg) --{ -- ASSERT(!page_get_owner(pg)); -- pg->count_info |= PGC_extra; --} -- --static inline bool page_refcounting_suppressed(const struct page_info *pg) --{ -- return !page_get_owner(pg) && (pg->count_info & PGC_extra); --} -- - struct platform_bad_page { - unsigned long mfn; - unsigned int order; --- -2.37.4 - diff --git a/0044-VT-d-constrain-IGD-check.patch b/0044-VT-d-constrain-IGD-check.patch new file mode 100644 index 0000000..13ca74e --- /dev/null +++ b/0044-VT-d-constrain-IGD-check.patch @@ -0,0 +1,44 @@ +From 4d42cc4d25c35ca381370a1fa0b45350723d1308 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 13:52:20 +0100 +Subject: [PATCH 44/61] VT-d: constrain IGD check + +Marking a DRHD as controlling an IGD isn't very sensible without +checking that at the very least it's a graphics device that lives at +0000:00:02.0. Re-use the reading of the class-code to control both the +clearing of "gfx_only" and the setting of "igd_drhd_address". 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: f8c4317295fa1cde1a81779b7e362651c084efb8 +master date: 2023-03-14 10:44:08 +0100 +--- + xen/drivers/passthrough/vtd/dmar.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c +index 33a12b2ae9..9ec49936b8 100644 +--- a/xen/drivers/passthrough/vtd/dmar.c ++++ b/xen/drivers/passthrough/vtd/dmar.c +@@ -391,15 +391,12 @@ static int __init acpi_parse_dev_scope( + + if ( drhd ) + { +- if ( (seg == 0) && (bus == 0) && (path->dev == 2) && +- (path->fn == 0) ) +- igd_drhd_address = drhd->address; +- +- if ( gfx_only && +- pci_conf_read8(PCI_SBDF(seg, bus, path->dev, path->fn), ++ if ( pci_conf_read8(PCI_SBDF(seg, bus, path->dev, path->fn), + PCI_CLASS_DEVICE + 1) != 0x03 + /* PCI_BASE_CLASS_DISPLAY */ ) + gfx_only = false; ++ else if ( !seg && !bus && path->dev == 2 && !path->fn ) ++ igd_drhd_address = drhd->address; + } + + break; +-- +2.40.0 + diff --git a/0044-tools-xenstore-create_node-Don-t-defer-work-to-undo-.patch b/0044-tools-xenstore-create_node-Don-t-defer-work-to-undo-.patch deleted file mode 100644 index 8b9ff53..0000000 --- a/0044-tools-xenstore-create_node-Don-t-defer-work-to-undo-.patch +++ /dev/null @@ -1,120 +0,0 @@ -From 28ea39a4eb476f9105e1021bef1367c075feaa0b Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 13 Sep 2022 07:35:06 +0200 -Subject: [PATCH 44/87] tools/xenstore: create_node: Don't defer work to undo - any changes on failure - -XSA-115 extended destroy_node() to update the node accounting for the -connection. The implementation is assuming the connection is the parent -of the node, however all the nodes are allocated using a separate context -(see process_message()). This will result to crash (or corrupt) xenstored -as the pointer is wrongly used. 
- -In case of an error, any changes to the database or update to the -accounting will now be reverted in create_node() by calling directly -destroy_node(). This has the nice advantage to remove the loop to unset -the destructors in case of success. - -Take the opportunity to free the nodes right now as they are not -going to be reachable (the function returns NULL) and are just wasting -resources. - -This is XSA-414 / CVE-2022-42309. - -Fixes: 0bfb2101f243 ("tools/xenstore: fix node accounting after failed node creation") -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -(cherry picked from commit 1cd3cc7ea27cda7640a8d895e09617b61c265697) ---- - tools/xenstore/xenstored_core.c | 47 ++++++++++++++++++++++----------- - 1 file changed, 32 insertions(+), 15 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 0c8ee276f837..29947c3020c3 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1088,9 +1088,8 @@ nomem: - return NULL; - } - --static int destroy_node(void *_node) -+static int destroy_node(struct connection *conn, struct node *node) - { -- struct node *node = _node; - TDB_DATA key; - - if (streq(node->name, "/")) -@@ -1099,7 +1098,7 @@ static int destroy_node(void *_node) - set_tdb_key(node->name, &key); - tdb_delete(tdb_ctx, key); - -- domain_entry_dec(talloc_parent(node), node); -+ domain_entry_dec(conn, node); - - return 0; - } -@@ -1108,7 +1107,8 @@ static struct node *create_node(struct connection *conn, const void *ctx, - const char *name, - void *data, unsigned int datalen) - { -- struct node *node, *i; -+ struct node *node, *i, *j; -+ int ret; - - node = construct_node(conn, ctx, name); - if (!node) -@@ -1130,23 +1130,40 @@ static struct node *create_node(struct connection *conn, const void *ctx, - /* i->parent is set for each new node, so check quota. 
*/ - if (i->parent && - domain_entry(conn) >= quota_nb_entry_per_domain) { -- errno = ENOSPC; -- return NULL; -+ ret = ENOSPC; -+ goto err; - } -- if (write_node(conn, i, false)) -- return NULL; - -- /* Account for new node, set destructor for error case. */ -- if (i->parent) { -+ ret = write_node(conn, i, false); -+ if (ret) -+ goto err; -+ -+ /* Account for new node */ -+ if (i->parent) - domain_entry_inc(conn, i); -- talloc_set_destructor(i, destroy_node); -- } - } - -- /* OK, now remove destructors so they stay around */ -- for (i = node; i->parent; i = i->parent) -- talloc_set_destructor(i, NULL); - return node; -+ -+err: -+ /* -+ * We failed to update TDB for some of the nodes. Undo any work that -+ * have already been done. -+ */ -+ for (j = node; j != i; j = j->parent) -+ destroy_node(conn, j); -+ -+ /* We don't need to keep the nodes around, so free them. */ -+ i = node; -+ while (i) { -+ j = i; -+ i = i->parent; -+ talloc_free(j); -+ } -+ -+ errno = ret; -+ -+ return NULL; - } - - /* path, data... */ --- -2.37.4 - diff --git a/0045-bunzip-work-around-gcc13-warning.patch b/0045-bunzip-work-around-gcc13-warning.patch new file mode 100644 index 0000000..9b26011 --- /dev/null +++ b/0045-bunzip-work-around-gcc13-warning.patch @@ -0,0 +1,42 @@ +From 49116b2101094c3d6658928f03db88d035ba97be Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 21 Mar 2023 13:52:58 +0100 +Subject: [PATCH 45/61] bunzip: work around gcc13 warning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +While provable that length[0] is always initialized (because symCount +cannot be zero), upcoming gcc13 fails to recognize this and warns about +the unconditional use of the value immediately following the loop. + +See also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106511. 
+ +Reported-by: Martin Liška <martin.liska@suse.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 402195e56de0aacf97e05c80ed367d464ca6938b +master date: 2023-03-14 10:45:28 +0100 +--- + xen/common/bunzip2.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/xen/common/bunzip2.c b/xen/common/bunzip2.c +index 2087cfbbed..5108e570ed 100644 +--- a/xen/common/bunzip2.c ++++ b/xen/common/bunzip2.c +@@ -233,6 +233,11 @@ static int __init get_next_block(struct bunzip_data *bd) + becomes negative, so an unsigned inequality catches + it.) */ + t = get_bits(bd, 5)-1; ++ /* GCC 13 has apparently improved use-before-set detection, but ++ it can't figure out that length[0] is always intialized by ++ virtue of symCount always being positive when making it here. ++ See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106511. */ ++ length[0] = 0; + for (i = 0; i < symCount; i++) { + for (;;) { + if (((unsigned)t) > (MAX_HUFCODE_BITS-1)) +-- +2.40.0 + diff --git a/0045-tools-xenstore-Fail-a-transaction-if-it-is-not-possi.patch b/0045-tools-xenstore-Fail-a-transaction-if-it-is-not-possi.patch deleted file mode 100644 index 4ca6c93..0000000 --- a/0045-tools-xenstore-Fail-a-transaction-if-it-is-not-possi.patch +++ /dev/null @@ -1,145 +0,0 @@ -From 427e86b48836a9511f57004ca367283cd85cd30f Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 13 Sep 2022 07:35:06 +0200 -Subject: [PATCH 45/87] tools/xenstore: Fail a transaction if it is not - possible to create a node - -Commit f2bebf72c4d5 "xenstore: rework of transaction handling" moved -out from copying the entire database everytime a new transaction is -opened to track the list of nodes changed. - -The content of all the nodes accessed during a transaction will be -temporarily stored in TDB using a different key. - -The function create_node() may write/update multiple nodes if the child -doesn't exist. 
In case of a failure, the function will revert any -changes (this include any update to TDB). Unfortunately, the function -which reverts the changes (i.e. destroy_node()) will not use the correct -key to delete any update or even request the transaction to fail. - -This means that if a client decide to go ahead with committing the -transaction, orphan nodes will be created because they were not linked -to an existing node (create_node() will write the nodes backwards). - -Once some nodes have been partially updated in a transaction, it is not -easily possible to undo any changes. So rather than continuing and hit -weird issue while committing, it is much saner to fail the transaction. - -This will have an impact on any client that decides to commit even if it -can't write a node. Although, it is not clear why a normal client would -want to do that... - -Lastly, update destroy_node() to use the correct key for deleting the -node. Rather than recreating it (this will allocate memory and -therefore fail), stash the key in the structure node. - -This is XSA-415 / CVE-2022-42310. - -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -(cherry picked from commit 5d71766bd1a4a3a8b2fe952ca2be80e02fe48f34) ---- - tools/xenstore/xenstored_core.c | 23 +++++++++++++++-------- - tools/xenstore/xenstored_core.h | 2 ++ - tools/xenstore/xenstored_transaction.c | 5 +++++ - tools/xenstore/xenstored_transaction.h | 3 +++ - 4 files changed, 25 insertions(+), 8 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 29947c3020c3..e9c9695fd16e 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -566,15 +566,17 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - return 0; - } - -+/* -+ * Write the node. If the node is written, caller can find the key used in -+ * node->key. This can later be used if the change needs to be reverted. 
-+ */ - static int write_node(struct connection *conn, struct node *node, - bool no_quota_check) - { -- TDB_DATA key; -- -- if (access_node(conn, node, NODE_ACCESS_WRITE, &key)) -+ if (access_node(conn, node, NODE_ACCESS_WRITE, &node->key)) - return errno; - -- return write_node_raw(conn, &key, node, no_quota_check); -+ return write_node_raw(conn, &node->key, node, no_quota_check); - } - - unsigned int perm_for_conn(struct connection *conn, -@@ -1090,16 +1092,21 @@ nomem: - - static int destroy_node(struct connection *conn, struct node *node) - { -- TDB_DATA key; -- - if (streq(node->name, "/")) - corrupt(NULL, "Destroying root node!"); - -- set_tdb_key(node->name, &key); -- tdb_delete(tdb_ctx, key); -+ tdb_delete(tdb_ctx, node->key); - - domain_entry_dec(conn, node); - -+ /* -+ * It is not possible to easily revert the changes in a transaction. -+ * So if the failure happens in a transaction, mark it as fail to -+ * prevent any commit. -+ */ -+ if ( conn->transaction ) -+ fail_transaction(conn->transaction); -+ - return 0; - } - -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 07d861d92499..0004fa848c83 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -155,6 +155,8 @@ struct node_perms { - - struct node { - const char *name; -+ /* Key used to update TDB */ -+ TDB_DATA key; - - /* Parent (optional) */ - struct node *parent; -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index cd07fb0f218b..faf6c930e42a 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -580,6 +580,11 @@ void transaction_entry_dec(struct transaction *trans, unsigned int domid) - list_add_tail(&d->list, &trans->changed_domains); - } - -+void fail_transaction(struct transaction *trans) -+{ -+ trans->fail = true; -+} -+ - void conn_delete_all_transactions(struct connection *conn) - { - struct transaction *trans; -diff --git 
a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 43a162bea3f3..14062730e3c9 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -46,6 +46,9 @@ int access_node(struct connection *conn, struct node *node, - int transaction_prepend(struct connection *conn, const char *name, - TDB_DATA *key); - -+/* Mark the transaction as failed. This will prevent it to be committed. */ -+void fail_transaction(struct transaction *trans); -+ - void conn_delete_all_transactions(struct connection *conn); - int check_transactions(struct hashtable *hash); - --- -2.37.4 - diff --git a/0046-libacpi-fix-PCI-hotplug-AML.patch b/0046-libacpi-fix-PCI-hotplug-AML.patch new file mode 100644 index 0000000..b1c79f5 --- /dev/null +++ b/0046-libacpi-fix-PCI-hotplug-AML.patch @@ -0,0 +1,57 @@ +From 54102e428ba3f677904278479f8110c8eef6fedc Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Tue, 21 Mar 2023 13:53:25 +0100 +Subject: [PATCH 46/61] libacpi: fix PCI hotplug AML + +The emulated PIIX3 uses a nybble for the status of each PCI function, +so the status for e.g. slot 0 functions 0 and 1 respectively can be +read as (\_GPE.PH00 & 0x0F), and (\_GPE.PH00 >> 0x04). + +The AML that Xen gives to a guest gets the operand order for the odd- +numbered functions the wrong way round, returning (0x04 >> \_GPE.PH00) +instead. + +As far as I can tell, this was the wrong way round in Xen from the +moment that PCI hotplug was first introduced in commit 83d82e6f35a8: + ++ ShiftRight (0x4, \_GPE.PH00, Local1) ++ Return (Local1) /* IN status as the _STA */ + +Or maybe there's bizarre AML operand ordering going on there, like +Intel's wrong-way-round assembler, and it only broke later when it was +changed to being generated? 
+ +Either way, it's definitely wrong now, and instrumenting a Linux guest +shows that it correctly sees _STA being 0x00 in function 0 of an empty +slot, but then the loop in acpiphp_glue.c::get_slot_status() goes on to +look at function 1 and sees that _STA evaluates to 0x04. Thus reporting +an adapter is present in every slot in /sys/bus/pci/slots/* + +Quite why Linux wants to look for function 1 being physically present +when function 0 isn't... I don't want to think about right now. + +Fixes: 83d82e6f35a8 ("hvmloader: pass-through: multi-function PCI hot-plug") +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: b190af7d3e90f58da5f58044b8dea7261b8b483d +master date: 2023-03-20 17:12:34 +0100 +--- + tools/libacpi/mk_dsdt.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/libacpi/mk_dsdt.c b/tools/libacpi/mk_dsdt.c +index c5ba4c0b2f..250a50b7eb 100644 +--- a/tools/libacpi/mk_dsdt.c ++++ b/tools/libacpi/mk_dsdt.c +@@ -431,7 +431,7 @@ int main(int argc, char **argv) + stmt("Store", "0x89, \\_GPE.DPT2"); + } + if ( slot & 1 ) +- stmt("ShiftRight", "0x4, \\_GPE.PH%02X, Local1", slot & ~1); ++ stmt("ShiftRight", "\\_GPE.PH%02X, 0x04, Local1", slot & ~1); + else + stmt("And", "\\_GPE.PH%02X, 0x0f, Local1", slot & ~1); + stmt("Return", "Local1"); /* IN status as the _STA */ +-- +2.40.0 + diff --git a/0046-tools-xenstore-split-up-send_reply.patch b/0046-tools-xenstore-split-up-send_reply.patch deleted file mode 100644 index 7af249a..0000000 --- a/0046-tools-xenstore-split-up-send_reply.patch +++ /dev/null @@ -1,213 +0,0 @@ -From ce6aea73f6c4c90fab2500933b3a488e2f30334b Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: [PATCH 46/87] tools/xenstore: split up send_reply() - -Today send_reply() is used for both, normal request replies and watch -events. - -Split it up into send_reply() and send_event(). 
This will be used to -add some event specific handling. - -add_event() can be merged into send_event(), removing the need for an -intermediate memory allocation. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 9bfde319dbac2a1321898d2f75a3f075c3eb7b32) ---- - tools/xenstore/xenstored_core.c | 74 +++++++++++++++++++------------- - tools/xenstore/xenstored_core.h | 1 + - tools/xenstore/xenstored_watch.c | 39 +++-------------- - 3 files changed, 52 insertions(+), 62 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index e9c9695fd16e..249ad5ec6fb1 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -767,49 +767,32 @@ static void send_error(struct connection *conn, int error) - void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - const void *data, unsigned int len) - { -- struct buffered_data *bdata; -+ struct buffered_data *bdata = conn->in; -+ -+ assert(type != XS_WATCH_EVENT); - - if ( len > XENSTORE_PAYLOAD_MAX ) { - send_error(conn, E2BIG); - return; - } - -- /* Replies reuse the request buffer, events need a new one. */ -- if (type != XS_WATCH_EVENT) { -- bdata = conn->in; -- /* Drop asynchronous responses, e.g. errors for watch events. */ -- if (!bdata) -- return; -- bdata->inhdr = true; -- bdata->used = 0; -- conn->in = NULL; -- } else { -- /* Message is a child of the connection for auto-cleanup. */ -- bdata = new_buffer(conn); -+ if (!bdata) -+ return; -+ bdata->inhdr = true; -+ bdata->used = 0; - -- /* -- * Allocation failure here is unfortunate: we have no way to -- * tell anybody about it. 
-- */ -- if (!bdata) -- return; -- } - if (len <= DEFAULT_BUFFER_SIZE) - bdata->buffer = bdata->default_buffer; -- else -+ else { - bdata->buffer = talloc_array(bdata, char, len); -- if (!bdata->buffer) { -- if (type == XS_WATCH_EVENT) { -- /* Same as above: no way to tell someone. */ -- talloc_free(bdata); -+ if (!bdata->buffer) { -+ send_error(conn, ENOMEM); - return; - } -- /* re-establish request buffer for sending ENOMEM. */ -- conn->in = bdata; -- send_error(conn, ENOMEM); -- return; - } - -+ conn->in = NULL; -+ - /* Update relevant header fields and fill in the message body. */ - bdata->hdr.msg.type = type; - bdata->hdr.msg.len = len; -@@ -817,8 +800,39 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+} - -- return; -+/* -+ * Send a watch event. -+ * As this is not directly related to the current command, errors can't be -+ * reported. -+ */ -+void send_event(struct connection *conn, const char *path, const char *token) -+{ -+ struct buffered_data *bdata; -+ unsigned int len; -+ -+ len = strlen(path) + 1 + strlen(token) + 1; -+ /* Don't try to send over-long events. */ -+ if (len > XENSTORE_PAYLOAD_MAX) -+ return; -+ -+ bdata = new_buffer(conn); -+ if (!bdata) -+ return; -+ -+ bdata->buffer = talloc_array(bdata, char, len); -+ if (!bdata->buffer) { -+ talloc_free(bdata); -+ return; -+ } -+ strcpy(bdata->buffer, path); -+ strcpy(bdata->buffer + strlen(path) + 1, token); -+ bdata->hdr.msg.type = XS_WATCH_EVENT; -+ bdata->hdr.msg.len = len; -+ -+ /* Queue for later transmission. 
*/ -+ list_add_tail(&bdata->list, &conn->out_list); - } - - /* Some routines (write, mkdir, etc) just need a non-error return */ -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 0004fa848c83..9af9af4390bd 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -187,6 +187,7 @@ unsigned int get_string(const struct buffered_data *data, unsigned int offset); - - void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - const void *data, unsigned int len); -+void send_event(struct connection *conn, const char *path, const char *token); - - /* Some routines (write, mkdir, etc) just need a non-error return */ - void send_ack(struct connection *conn, enum xsd_sockmsg_type type); -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index aca0a71bada1..99a2c266b28a 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -85,35 +85,6 @@ static const char *get_watch_path(const struct watch *watch, const char *name) - return path; - } - --/* -- * Send a watch event. -- * Temporary memory allocations are done with ctx. -- */ --static void add_event(struct connection *conn, -- const void *ctx, -- struct watch *watch, -- const char *name) --{ -- /* Data to send (node\0token\0). */ -- unsigned int len; -- char *data; -- -- name = get_watch_path(watch, name); -- -- len = strlen(name) + 1 + strlen(watch->token) + 1; -- /* Don't try to send over-long events. 
*/ -- if (len > XENSTORE_PAYLOAD_MAX) -- return; -- -- data = talloc_array(ctx, char, len); -- if (!data) -- return; -- strcpy(data, name); -- strcpy(data + strlen(name) + 1, watch->token); -- send_reply(conn, XS_WATCH_EVENT, data, len); -- talloc_free(data); --} -- - /* - * Check permissions of a specific watch to fire: - * Either the node itself or its parent have to be readable by the connection -@@ -190,10 +161,14 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - list_for_each_entry(watch, &i->watches, list) { - if (exact) { - if (streq(name, watch->node)) -- add_event(i, ctx, watch, name); -+ send_event(i, -+ get_watch_path(watch, name), -+ watch->token); - } else { - if (is_child(name, watch->node)) -- add_event(i, ctx, watch, name); -+ send_event(i, -+ get_watch_path(watch, name), -+ watch->token); - } - } - } -@@ -292,7 +267,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) - send_ack(conn, XS_WATCH); - - /* We fire once up front: simplifies clients and restart. */ -- add_event(conn, in, watch, watch->node); -+ send_event(conn, get_watch_path(watch, watch->node), watch->token); - - return 0; - } --- -2.37.4 - diff --git a/0047-AMD-IOMMU-without-XT-x2APIC-needs-to-be-forced-into-.patch b/0047-AMD-IOMMU-without-XT-x2APIC-needs-to-be-forced-into-.patch new file mode 100644 index 0000000..54940ba --- /dev/null +++ b/0047-AMD-IOMMU-without-XT-x2APIC-needs-to-be-forced-into-.patch @@ -0,0 +1,42 @@ +From 8e9690a2252eda09537275a951ee0af0b3b330f2 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 31 Mar 2023 08:36:59 +0200 +Subject: [PATCH 47/61] AMD/IOMMU: without XT, x2APIC needs to be forced into + physical mode + +An earlier change with the same title (commit 1ba66a870eba) altered only +the path where x2apic_phys was already set to false (perhaps from the +command line). The same of course needs applying when the variable +wasn't modified yet from its initial value. 
+ +Reported-by: Elliott Mitchell <ehem+xen@m5p.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 0d2686f6b66b4b1b3c72c3525083b0ce02830054 +master date: 2023-03-21 09:23:25 +0100 +--- + xen/arch/x86/genapic/x2apic.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c +index 628b441da5..247364af58 100644 +--- a/xen/arch/x86/genapic/x2apic.c ++++ b/xen/arch/x86/genapic/x2apic.c +@@ -239,11 +239,11 @@ const struct genapic *__init apic_x2apic_probe(void) + if ( x2apic_phys < 0 ) + { + /* +- * Force physical mode if there's no interrupt remapping support: The +- * ID in clustered mode requires a 32 bit destination field due to ++ * Force physical mode if there's no (full) interrupt remapping support: ++ * The ID in clustered mode requires a 32 bit destination field due to + * the usage of the high 16 bits to hold the cluster ID. + */ +- x2apic_phys = !iommu_intremap || ++ x2apic_phys = iommu_intremap != iommu_intremap_full || + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL); + } + else if ( !x2apic_phys ) +-- +2.40.0 + diff --git a/0047-tools-xenstore-add-helpers-to-free-struct-buffered_d.patch b/0047-tools-xenstore-add-helpers-to-free-struct-buffered_d.patch deleted file mode 100644 index 96ba7bd..0000000 --- a/0047-tools-xenstore-add-helpers-to-free-struct-buffered_d.patch +++ /dev/null @@ -1,117 +0,0 @@ -From f8af1a27b00e373bfb5f5e61b14c51165a740fa4 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: [PATCH 47/87] tools/xenstore: add helpers to free struct - buffered_data - -Add two helpers for freeing struct buffered_data: free_buffered_data() -for freeing one instance and conn_free_buffered_data() for freeing all -instances for a connection. 
- -This is avoiding duplicated code and will help later when more actions -are needed when freeing a struct buffered_data. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit ead062a68a9c201a95488e84750a70a107f7b317) ---- - tools/xenstore/xenstored_core.c | 26 +++++++++++++++++--------- - tools/xenstore/xenstored_core.h | 2 ++ - tools/xenstore/xenstored_domain.c | 7 +------ - 3 files changed, 20 insertions(+), 15 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 249ad5ec6fb1..527a1ebdeded 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -211,6 +211,21 @@ void reopen_log(void) - } - } - -+static void free_buffered_data(struct buffered_data *out, -+ struct connection *conn) -+{ -+ list_del(&out->list); -+ talloc_free(out); -+} -+ -+void conn_free_buffered_data(struct connection *conn) -+{ -+ struct buffered_data *out; -+ -+ while ((out = list_top(&conn->out_list, struct buffered_data, list))) -+ free_buffered_data(out, conn); -+} -+ - static bool write_messages(struct connection *conn) - { - int ret; -@@ -254,8 +269,7 @@ static bool write_messages(struct connection *conn) - - trace_io(conn, out, 1); - -- list_del(&out->list); -- talloc_free(out); -+ free_buffered_data(out, conn); - - return true; - } -@@ -1506,18 +1520,12 @@ static struct { - */ - void ignore_connection(struct connection *conn) - { -- struct buffered_data *out, *tmp; -- - trace("CONN %p ignored\n", conn); - - conn->is_ignored = true; - conn_delete_all_watches(conn); - conn_delete_all_transactions(conn); -- -- list_for_each_entry_safe(out, tmp, &conn->out_list, list) { -- list_del(&out->list); -- talloc_free(out); -- } -+ conn_free_buffered_data(conn); - - talloc_free(conn->in); - conn->in = NULL; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 9af9af4390bd..e7ee87825c3b 
100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -276,6 +276,8 @@ int remember_string(struct hashtable *hash, const char *str); - - void set_tdb_key(const char *name, TDB_DATA *key); - -+void conn_free_buffered_data(struct connection *conn); -+ - const char *dump_state_global(FILE *fp); - const char *dump_state_buffered_data(FILE *fp, const struct connection *c, - struct xs_state_connection *sc); -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index d03c7d93a9e7..93c4c1edcdd1 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -411,15 +411,10 @@ static struct domain *find_domain_by_domid(unsigned int domid) - static void domain_conn_reset(struct domain *domain) - { - struct connection *conn = domain->conn; -- struct buffered_data *out; - - conn_delete_all_watches(conn); - conn_delete_all_transactions(conn); -- -- while ((out = list_top(&conn->out_list, struct buffered_data, list))) { -- list_del(&out->list); -- talloc_free(out); -- } -+ conn_free_buffered_data(conn); - - talloc_free(conn->in); - --- -2.37.4 - diff --git a/0048-VT-d-fix-iommu-no-igfx-if-the-IOMMU-scope-contains-f.patch b/0048-VT-d-fix-iommu-no-igfx-if-the-IOMMU-scope-contains-f.patch new file mode 100644 index 0000000..4c480b0 --- /dev/null +++ b/0048-VT-d-fix-iommu-no-igfx-if-the-IOMMU-scope-contains-f.patch @@ -0,0 +1,44 @@ +From 07e8f5b3d1300327a9f2e67b03dead0e2138b92f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + <marmarek@invisiblethingslab.com> +Date: Fri, 31 Mar 2023 08:38:07 +0200 +Subject: [PATCH 48/61] VT-d: fix iommu=no-igfx if the IOMMU scope contains + fake device(s) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +If the scope for IGD's IOMMU contains additional device that doesn't +actually exist, iommu=no-igfx would not disable that IOMMU. 
In this +particular case (Thinkpad x230) it included 00:02.1, but there is no +such device on this platform. Consider only existing devices for the +"gfx only" check as well as the establishing of IGD DRHD address +(underlying is_igd_drhd(), which is used to determine applicability of +two workarounds). + +Fixes: 2d7f191b392e ("VT-d: generalize and correct "iommu=no-igfx" handling") +Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: 49de6749baa8d0addc3048defd4ef3e85cb135e9 +master date: 2023-03-23 09:16:41 +0100 +--- + xen/drivers/passthrough/vtd/dmar.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c +index 9ec49936b8..bfec40f47d 100644 +--- a/xen/drivers/passthrough/vtd/dmar.c ++++ b/xen/drivers/passthrough/vtd/dmar.c +@@ -389,7 +389,7 @@ static int __init acpi_parse_dev_scope( + printk(VTDPREFIX " endpoint: %pp\n", + &PCI_SBDF(seg, bus, path->dev, path->fn)); + +- if ( drhd ) ++ if ( drhd && pci_device_detect(seg, bus, path->dev, path->fn) ) + { + if ( pci_conf_read8(PCI_SBDF(seg, bus, path->dev, path->fn), + PCI_CLASS_DEVICE + 1) != 0x03 +-- +2.40.0 + diff --git a/0048-tools-xenstore-reduce-number-of-watch-events.patch b/0048-tools-xenstore-reduce-number-of-watch-events.patch deleted file mode 100644 index 3a080fb..0000000 --- a/0048-tools-xenstore-reduce-number-of-watch-events.patch +++ /dev/null @@ -1,201 +0,0 @@ -From e26d6f4d1b389b859fb5a6570421e80e0213f92b Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: [PATCH 48/87] tools/xenstore: reduce number of watch events - -When removing a watched node outside of a transaction, two watch events -are being produced instead of just a single one. 
- -When finalizing a transaction watch events can be generated for each -node which is being modified, even if outside a transaction such -modifications might not have resulted in a watch event. - -This happens e.g.: - -- for nodes which are only modified due to added/removed child entries -- for nodes being removed or created implicitly (e.g. creation of a/b/c - is implicitly creating a/b, resulting in watch events for a, a/b and - a/b/c instead of a/b/c only) - -Avoid these additional watch events, in order to reduce the needed -memory inside Xenstore for queueing them. - -This is being achieved by adding event flags to struct accessed_node -specifying whether an event should be triggered, and whether it should -be an exact match of the modified path. Both flags can be set from -fire_watches() instead of implying them only. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 3a96013a3e17baa07410b1b9776225d1d9a74297) ---- - tools/xenstore/xenstored_core.c | 19 ++++++------ - tools/xenstore/xenstored_transaction.c | 41 +++++++++++++++++++++----- - tools/xenstore/xenstored_transaction.h | 3 ++ - tools/xenstore/xenstored_watch.c | 7 +++-- - 4 files changed, 51 insertions(+), 19 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 527a1ebdeded..bf2243873901 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1295,7 +1295,7 @@ static void delete_child(struct connection *conn, - } - - static int delete_node(struct connection *conn, const void *ctx, -- struct node *parent, struct node *node) -+ struct node *parent, struct node *node, bool watch_exact) - { - char *name; - -@@ -1307,7 +1307,7 @@ static int delete_node(struct connection *conn, const void *ctx, - node->children); - child = name ? 
read_node(conn, node, name) : NULL; - if (child) { -- if (delete_node(conn, ctx, node, child)) -+ if (delete_node(conn, ctx, node, child, true)) - return errno; - } else { - trace("delete_node: Error deleting child '%s/%s'!\n", -@@ -1319,7 +1319,12 @@ static int delete_node(struct connection *conn, const void *ctx, - talloc_free(name); - } - -- fire_watches(conn, ctx, node->name, node, true, NULL); -+ /* -+ * Fire the watches now, when we can still see the node permissions. -+ * This fine as we are single threaded and the next possible read will -+ * be handled only after the node has been really removed. -+ */ -+ fire_watches(conn, ctx, node->name, node, watch_exact, NULL); - delete_node_single(conn, node); - delete_child(conn, parent, basename(node->name)); - talloc_free(node); -@@ -1345,13 +1350,7 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - return (errno == ENOMEM) ? ENOMEM : EINVAL; - node->parent = parent; - -- /* -- * Fire the watches now, when we can still see the node permissions. -- * This fine as we are single threaded and the next possible read will -- * be handled only after the node has been really removed. -- */ -- fire_watches(conn, ctx, name, node, false, NULL); -- return delete_node(conn, ctx, parent, node); -+ return delete_node(conn, ctx, parent, node, false); - } - - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index faf6c930e42a..54432907fc76 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -130,6 +130,10 @@ struct accessed_node - - /* Transaction node in data base? */ - bool ta_node; -+ -+ /* Watch event flags. */ -+ bool fire_watch; -+ bool watch_exact; - }; - - struct changed_domain -@@ -323,6 +327,29 @@ err: - return ret; - } - -+/* -+ * A watch event should be fired for a node modified inside a transaction. -+ * Set the corresponding information. 
A non-exact event is replacing an exact -+ * one, but not the other way round. -+ */ -+void queue_watches(struct connection *conn, const char *name, bool watch_exact) -+{ -+ struct accessed_node *i; -+ -+ i = find_accessed_node(conn->transaction, name); -+ if (!i) { -+ conn->transaction->fail = true; -+ return; -+ } -+ -+ if (!i->fire_watch) { -+ i->fire_watch = true; -+ i->watch_exact = watch_exact; -+ } else if (!watch_exact) { -+ i->watch_exact = false; -+ } -+} -+ - /* - * Finalize transaction: - * Walk through accessed nodes and check generation against global data. -@@ -377,15 +404,15 @@ static int finalize_transaction(struct connection *conn, - ret = tdb_store(tdb_ctx, key, data, - TDB_REPLACE); - talloc_free(data.dptr); -- if (ret) -- goto err; -- fire_watches(conn, trans, i->node, NULL, false, -- i->perms.p ? &i->perms : NULL); - } else { -- fire_watches(conn, trans, i->node, NULL, false, -+ ret = tdb_delete(tdb_ctx, key); -+ } -+ if (ret) -+ goto err; -+ if (i->fire_watch) { -+ fire_watches(conn, trans, i->node, NULL, -+ i->watch_exact, - i->perms.p ? &i->perms : NULL); -- if (tdb_delete(tdb_ctx, key)) -- goto err; - } - } - -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 14062730e3c9..0093cac807e3 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -42,6 +42,9 @@ void transaction_entry_dec(struct transaction *trans, unsigned int domid); - int access_node(struct connection *conn, struct node *node, - enum node_access_type type, TDB_DATA *key); - -+/* Queue watches for a modified node. */ -+void queue_watches(struct connection *conn, const char *name, bool watch_exact); -+ - /* Prepend the transaction to name if appropriate. 
*/ - int transaction_prepend(struct connection *conn, const char *name, - TDB_DATA *key); -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 99a2c266b28a..205d9d8ea116 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -29,6 +29,7 @@ - #include "xenstore_lib.h" - #include "utils.h" - #include "xenstored_domain.h" -+#include "xenstored_transaction.h" - - extern int quota_nb_watch_per_domain; - -@@ -143,9 +144,11 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - struct connection *i; - struct watch *watch; - -- /* During transactions, don't fire watches. */ -- if (conn && conn->transaction) -+ /* During transactions, don't fire watches, but queue them. */ -+ if (conn && conn->transaction) { -+ queue_watches(conn, name, exact); - return; -+ } - - /* Create an event for each watch. */ - list_for_each_entry(i, &connections, list) { --- -2.37.4 - diff --git a/0049-tools-xenstore-let-unread-watch-events-time-out.patch b/0049-tools-xenstore-let-unread-watch-events-time-out.patch deleted file mode 100644 index dab0861..0000000 --- a/0049-tools-xenstore-let-unread-watch-events-time-out.patch +++ /dev/null @@ -1,309 +0,0 @@ -From d08cdf0b19daf948a6b9754e90de9bc304bcd262 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: [PATCH 49/87] tools/xenstore: let unread watch events time out - -A future modification will limit the number of outstanding requests -for a domain, where "outstanding" means that the response of the -request or any resulting watch event hasn't been consumed yet. - -In order to avoid a malicious guest being capable to block other guests -by not reading watch events, add a timeout for watch events. In case a -watch event hasn't been consumed after this timeout, it is being -deleted. Set the default timeout to 20 seconds (a random value being -not too high). 
- -In order to support to specify other timeout values in future, use a -generic command line option for that purpose: - ---timeout|-w watch-event=<seconds> - -This is part of XSA-326 / CVE-2022-42311. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 5285dcb1a5c01695c11e6397c95d906b5e765c98) ---- - tools/xenstore/xenstored_core.c | 133 +++++++++++++++++++++++++++++++- - tools/xenstore/xenstored_core.h | 6 ++ - 2 files changed, 138 insertions(+), 1 deletion(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index bf2243873901..45244c021cd3 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -108,6 +108,8 @@ int quota_max_transaction = 10; - int quota_nb_perms_per_node = 5; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; - -+unsigned int timeout_watch_event_msec = 20000; -+ - void trace(const char *fmt, ...) - { - va_list arglist; -@@ -211,19 +213,92 @@ void reopen_log(void) - } - } - -+static uint64_t get_now_msec(void) -+{ -+ struct timespec now_ts; -+ -+ if (clock_gettime(CLOCK_MONOTONIC, &now_ts)) -+ barf_perror("Could not find time (clock_gettime failed)"); -+ -+ return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; -+} -+ - static void free_buffered_data(struct buffered_data *out, - struct connection *conn) - { -+ struct buffered_data *req; -+ - list_del(&out->list); -+ -+ /* -+ * Update conn->timeout_msec with the next found timeout value in the -+ * queued pending requests. 
-+ */ -+ if (out->timeout_msec) { -+ conn->timeout_msec = 0; -+ list_for_each_entry(req, &conn->out_list, list) { -+ if (req->timeout_msec) { -+ conn->timeout_msec = req->timeout_msec; -+ break; -+ } -+ } -+ } -+ - talloc_free(out); - } - -+static void check_event_timeout(struct connection *conn, uint64_t msecs, -+ int *ptimeout) -+{ -+ uint64_t delta; -+ struct buffered_data *out, *tmp; -+ -+ if (!conn->timeout_msec) -+ return; -+ -+ delta = conn->timeout_msec - msecs; -+ if (conn->timeout_msec <= msecs) { -+ delta = 0; -+ list_for_each_entry_safe(out, tmp, &conn->out_list, list) { -+ /* -+ * Only look at buffers with timeout and no data -+ * already written to the ring. -+ */ -+ if (out->timeout_msec && out->inhdr && !out->used) { -+ if (out->timeout_msec > msecs) { -+ conn->timeout_msec = out->timeout_msec; -+ delta = conn->timeout_msec - msecs; -+ break; -+ } -+ -+ /* -+ * Free out without updating conn->timeout_msec, -+ * as the update is done in this loop already. -+ */ -+ out->timeout_msec = 0; -+ trace("watch event path %s for domain %u timed out\n", -+ out->buffer, conn->id); -+ free_buffered_data(out, conn); -+ } -+ } -+ if (!delta) { -+ conn->timeout_msec = 0; -+ return; -+ } -+ } -+ -+ if (*ptimeout == -1 || *ptimeout > delta) -+ *ptimeout = delta; -+} -+ - void conn_free_buffered_data(struct connection *conn) - { - struct buffered_data *out; - - while ((out = list_top(&conn->out_list, struct buffered_data, list))) - free_buffered_data(out, conn); -+ -+ conn->timeout_msec = 0; - } - - static bool write_messages(struct connection *conn) -@@ -411,6 +486,7 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout) - { - struct connection *conn; - struct wrl_timestampt now; -+ uint64_t msecs; - - if (fds) - memset(fds, 0, sizeof(struct pollfd) * current_array_size); -@@ -431,10 +507,12 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout) - - wrl_gettime_now(&now); - wrl_log_periodic(now); -+ msecs = get_now_msec(); - - 
list_for_each_entry(conn, &connections, list) { - if (conn->domain) { - wrl_check_timeout(conn->domain, now, ptimeout); -+ check_event_timeout(conn, msecs, ptimeout); - if (conn_can_read(conn) || - (conn_can_write(conn) && - !list_empty(&conn->out_list))) -@@ -794,6 +872,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - return; - bdata->inhdr = true; - bdata->used = 0; -+ bdata->timeout_msec = 0; - - if (len <= DEFAULT_BUFFER_SIZE) - bdata->buffer = bdata->default_buffer; -@@ -845,6 +924,12 @@ void send_event(struct connection *conn, const char *path, const char *token) - bdata->hdr.msg.type = XS_WATCH_EVENT; - bdata->hdr.msg.len = len; - -+ if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { -+ bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; -+ if (!conn->timeout_msec) -+ conn->timeout_msec = bdata->timeout_msec; -+ } -+ - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); - } -@@ -2201,6 +2286,9 @@ static void usage(void) - " -t, --transaction <nb> limit the number of transaction allowed per domain,\n" - " -A, --perm-nb <nb> limit the number of permissions per node,\n" - " -M, --path-max <chars> limit the allowed Xenstore node path length,\n" -+" -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" -+" allowed timeout candidates are:\n" -+" watch-event: time a watch-event is kept pending\n" - " -R, --no-recovery to request that no recovery should be attempted when\n" - " the store is corrupted (debug only),\n" - " -I, --internal-db store database in memory, not on disk\n" -@@ -2223,6 +2311,7 @@ static struct option options[] = { - { "transaction", 1, NULL, 't' }, - { "perm-nb", 1, NULL, 'A' }, - { "path-max", 1, NULL, 'M' }, -+ { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, - { "verbose", 0, NULL, 'V' }, -@@ -2236,6 +2325,39 @@ int dom0_domid = 0; - int dom0_event = 0; - int priv_domid = 0; - -+static int 
get_optval_int(const char *arg) -+{ -+ char *end; -+ long val; -+ -+ val = strtol(arg, &end, 10); -+ if (!*arg || *end || val < 0 || val > INT_MAX) -+ barf("invalid parameter value \"%s\"\n", arg); -+ -+ return val; -+} -+ -+static bool what_matches(const char *arg, const char *what) -+{ -+ unsigned int what_len = strlen(what); -+ -+ return !strncmp(arg, what, what_len) && arg[what_len] == '='; -+} -+ -+static void set_timeout(const char *arg) -+{ -+ const char *eq = strchr(arg, '='); -+ int val; -+ -+ if (!eq) -+ barf("quotas must be specified via <what>=<seconds>\n"); -+ val = get_optval_int(eq + 1); -+ if (what_matches(arg, "watch-event")) -+ timeout_watch_event_msec = val * 1000; -+ else -+ barf("unknown timeout \"%s\"\n", arg); -+} -+ - int main(int argc, char *argv[]) - { - int opt; -@@ -2250,7 +2372,7 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:U", options, -+ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:w:U", options, - NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2300,6 +2422,9 @@ int main(int argc, char *argv[]) - quota_max_path_len = min(XENSTORE_REL_PATH_MAX, - quota_max_path_len); - break; -+ case 'w': -+ set_timeout(optarg); -+ break; - case 'e': - dom0_event = strtol(optarg, NULL, 10); - break; -@@ -2741,6 +2866,12 @@ static void add_buffered_data(struct buffered_data *bdata, - barf("error restoring buffered data"); - - memcpy(bdata->buffer, data, len); -+ if (bdata->hdr.msg.type == XS_WATCH_EVENT && timeout_watch_event_msec && -+ domain_is_unprivileged(conn)) { -+ bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; -+ if (!conn->timeout_msec) -+ conn->timeout_msec = bdata->timeout_msec; -+ } - - /* Queue for later transmission. 
*/ - list_add_tail(&bdata->list, &conn->out_list); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index e7ee87825c3b..8a81fc693f01 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -27,6 +27,7 @@ - #include <fcntl.h> - #include <stdbool.h> - #include <stdint.h> -+#include <time.h> - #include <errno.h> - - #include "xenstore_lib.h" -@@ -67,6 +68,8 @@ struct buffered_data - char raw[sizeof(struct xsd_sockmsg)]; - } hdr; - -+ uint64_t timeout_msec; -+ - /* The actual data. */ - char *buffer; - char default_buffer[DEFAULT_BUFFER_SIZE]; -@@ -118,6 +121,7 @@ struct connection - - /* Buffered output data */ - struct list_head out_list; -+ uint64_t timeout_msec; - - /* Transaction context for current request (NULL if none). */ - struct transaction *transaction; -@@ -244,6 +248,8 @@ extern int dom0_event; - extern int priv_domid; - extern int quota_nb_entry_per_domain; - -+extern unsigned int timeout_watch_event_msec; -+ - /* Map the kernel's xenstore page. */ - void *xenbus_map(void); - void unmap_xenbus(void *interface); --- -2.37.4 - diff --git a/0049-x86-shadow-fix-and-improve-sh_page_has_multiple_shad.patch b/0049-x86-shadow-fix-and-improve-sh_page_has_multiple_shad.patch new file mode 100644 index 0000000..0abf7e9 --- /dev/null +++ b/0049-x86-shadow-fix-and-improve-sh_page_has_multiple_shad.patch @@ -0,0 +1,47 @@ +From cab866ee62d860e9ff4abe701163972d4e9f896d Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 31 Mar 2023 08:38:42 +0200 +Subject: [PATCH 49/61] x86/shadow: fix and improve + sh_page_has_multiple_shadows() + +While no caller currently invokes the function without first making sure +there is at least one shadow [1], we'd better eliminate UB here: +find_first_set_bit() requires input to be non-zero to return a well- +defined result. + +Further, using find_first_set_bit() isn't very efficient in the first +place for the intended purpose. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> + +[1] The function has exactly two uses, and both are from OOS code, which + is HVM-only. For HVM (but not for PV) sh_mfn_is_a_page_table(), + guarding the call to sh_unsync(), guarantees at least one shadow. + Hence even if sh_page_has_multiple_shadows() returned a bogus value + when invoked for a PV domain, the subsequent is_hvm_vcpu() and + oos_active checks (the former being redundant with the latter) will + compensate. (Arguably that oos_active check should come first, for + both clarity and efficiency reasons.) +master commit: 2896224a4e294652c33f487b603d20bd30955f21 +master date: 2023-03-24 11:07:08 +0100 +--- + xen/arch/x86/mm/shadow/private.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h +index 738214f75e..762214f73c 100644 +--- a/xen/arch/x86/mm/shadow/private.h ++++ b/xen/arch/x86/mm/shadow/private.h +@@ -324,7 +324,7 @@ static inline int sh_page_has_multiple_shadows(struct page_info *pg) + return 0; + shadows = pg->shadow_flags & SHF_page_type_mask; + /* More than one type bit set in shadow-flags? */ +- return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 ); ++ return shadows && (shadows & (shadows - 1)); + } + + #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) +-- +2.40.0 + diff --git a/0050-tools-xenstore-limit-outstanding-requests.patch b/0050-tools-xenstore-limit-outstanding-requests.patch deleted file mode 100644 index bb10180..0000000 --- a/0050-tools-xenstore-limit-outstanding-requests.patch +++ /dev/null @@ -1,453 +0,0 @@ -From 49344fb86ff040bae1107e236592c2d4dc4607f3 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: [PATCH 50/87] tools/xenstore: limit outstanding requests - -Add another quota for limiting the number of outstanding requests of a -guest. 
As the way to specify quotas on the command line is becoming -rather nasty, switch to a new scheme using [--quota|-Q] <what>=<val> -allowing to add more quotas in future easily. - -Set the default value to 20 (basically a random value not seeming to -be too high or too low). - -A request is said to be outstanding if any message generated by this -request (the direct response plus potential watch events) is not yet -completely stored into a ring buffer. The initial watch event sent as -a result of registering a watch is an exception. - -Note that across a live update the relation to buffered watch events -for other domains is lost. - -Use talloc_zero() for allocating the domain structure in order to have -all per-domain quota zeroed initially. - -This is part of XSA-326 / CVE-2022-42312. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 36de433a273f55d614c83b89c9a8972287a1e475) ---- - tools/xenstore/xenstored_core.c | 88 +++++++++++++++++++++++++++++-- - tools/xenstore/xenstored_core.h | 20 ++++++- - tools/xenstore/xenstored_domain.c | 38 ++++++++++--- - tools/xenstore/xenstored_domain.h | 3 ++ - tools/xenstore/xenstored_watch.c | 15 ++++-- - 5 files changed, 150 insertions(+), 14 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 45244c021cd3..488d540f3a32 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -107,6 +107,7 @@ int quota_max_entry_size = 2048; /* 2K */ - int quota_max_transaction = 10; - int quota_nb_perms_per_node = 5; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; -+int quota_req_outstanding = 20; - - unsigned int timeout_watch_event_msec = 20000; - -@@ -223,12 +224,24 @@ static uint64_t get_now_msec(void) - return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; - } - -+/* -+ * Remove a struct buffered_data from the list of outgoing data. 
-+ * A struct buffered_data related to a request having caused watch events to be -+ * sent is kept until all those events have been written out. -+ * Each watch event is referencing the related request via pend.req, while the -+ * number of watch events caused by a request is kept in pend.ref.event_cnt -+ * (those two cases are mutually exclusive, so the two fields can share memory -+ * via a union). -+ * The struct buffered_data is freed only if no related watch event is -+ * referencing it. The related return data can be freed right away. -+ */ - static void free_buffered_data(struct buffered_data *out, - struct connection *conn) - { - struct buffered_data *req; - - list_del(&out->list); -+ out->on_out_list = false; - - /* - * Update conn->timeout_msec with the next found timeout value in the -@@ -244,6 +257,30 @@ static void free_buffered_data(struct buffered_data *out, - } - } - -+ if (out->hdr.msg.type == XS_WATCH_EVENT) { -+ req = out->pend.req; -+ if (req) { -+ req->pend.ref.event_cnt--; -+ if (!req->pend.ref.event_cnt && !req->on_out_list) { -+ if (req->on_ref_list) { -+ domain_outstanding_domid_dec( -+ req->pend.ref.domid); -+ list_del(&req->list); -+ } -+ talloc_free(req); -+ } -+ } -+ } else if (out->pend.ref.event_cnt) { -+ /* Hang out off from conn. */ -+ talloc_steal(NULL, out); -+ if (out->buffer != out->default_buffer) -+ talloc_free(out->buffer); -+ list_add(&out->list, &conn->ref_list); -+ out->on_ref_list = true; -+ return; -+ } else -+ domain_outstanding_dec(conn); -+ - talloc_free(out); - } - -@@ -405,6 +442,7 @@ int delay_request(struct connection *conn, struct buffered_data *in, - static int destroy_conn(void *_conn) - { - struct connection *conn = _conn; -+ struct buffered_data *req; - - /* Flush outgoing if possible, but don't block. 
*/ - if (!conn->domain) { -@@ -418,6 +456,11 @@ static int destroy_conn(void *_conn) - break; - close(conn->fd); - } -+ -+ conn_free_buffered_data(conn); -+ list_for_each_entry(req, &conn->ref_list, list) -+ req->on_ref_list = false; -+ - if (conn->target) - talloc_unlink(conn, conn->target); - list_del(&conn->list); -@@ -893,6 +936,8 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+ bdata->on_out_list = true; -+ domain_outstanding_inc(conn); - } - - /* -@@ -900,7 +945,8 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - * As this is not directly related to the current command, errors can't be - * reported. - */ --void send_event(struct connection *conn, const char *path, const char *token) -+void send_event(struct buffered_data *req, struct connection *conn, -+ const char *path, const char *token) - { - struct buffered_data *bdata; - unsigned int len; -@@ -930,8 +976,13 @@ void send_event(struct connection *conn, const char *path, const char *token) - conn->timeout_msec = bdata->timeout_msec; - } - -+ bdata->pend.req = req; -+ if (req) -+ req->pend.ref.event_cnt++; -+ - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+ bdata->on_out_list = true; - } - - /* Some routines (write, mkdir, etc) just need a non-error return */ -@@ -1740,6 +1791,7 @@ static void handle_input(struct connection *conn) - return; - } - in = conn->in; -+ in->pend.ref.domid = conn->id; - - /* Not finished header yet? 
*/ - if (in->inhdr) { -@@ -1808,6 +1860,7 @@ struct connection *new_connection(const struct interface_funcs *funcs) - new->is_stalled = false; - new->transaction_started = 0; - INIT_LIST_HEAD(&new->out_list); -+ INIT_LIST_HEAD(&new->ref_list); - INIT_LIST_HEAD(&new->watches); - INIT_LIST_HEAD(&new->transaction_list); - INIT_LIST_HEAD(&new->delayed); -@@ -2286,6 +2339,9 @@ static void usage(void) - " -t, --transaction <nb> limit the number of transaction allowed per domain,\n" - " -A, --perm-nb <nb> limit the number of permissions per node,\n" - " -M, --path-max <chars> limit the allowed Xenstore node path length,\n" -+" -Q, --quota <what>=<nb> set the quota <what> to the value <nb>, allowed\n" -+" quotas are:\n" -+" outstanding: number of outstanding requests\n" - " -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" - " allowed timeout candidates are:\n" - " watch-event: time a watch-event is kept pending\n" -@@ -2311,6 +2367,7 @@ static struct option options[] = { - { "transaction", 1, NULL, 't' }, - { "perm-nb", 1, NULL, 'A' }, - { "path-max", 1, NULL, 'M' }, -+ { "quota", 1, NULL, 'Q' }, - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, -@@ -2358,6 +2415,20 @@ static void set_timeout(const char *arg) - barf("unknown timeout \"%s\"\n", arg); - } - -+static void set_quota(const char *arg) -+{ -+ const char *eq = strchr(arg, '='); -+ int val; -+ -+ if (!eq) -+ barf("quotas must be specified via <what>=<nb>\n"); -+ val = get_optval_int(eq + 1); -+ if (what_matches(arg, "outstanding")) -+ quota_req_outstanding = val; -+ else -+ barf("unknown quota \"%s\"\n", arg); -+} -+ - int main(int argc, char *argv[]) - { - int opt; -@@ -2372,8 +2443,8 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:w:U", options, -- NULL)) != -1) { -+ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:T:RVW:w:U", -+ options, 
NULL)) != -1) { - switch (opt) { - case 'D': - no_domain_init = true; -@@ -2422,6 +2493,9 @@ int main(int argc, char *argv[]) - quota_max_path_len = min(XENSTORE_REL_PATH_MAX, - quota_max_path_len); - break; -+ case 'Q': -+ set_quota(optarg); -+ break; - case 'w': - set_timeout(optarg); - break; -@@ -2875,6 +2949,14 @@ static void add_buffered_data(struct buffered_data *bdata, - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+ bdata->on_out_list = true; -+ /* -+ * Watch events are never "outstanding", but the request causing them -+ * are instead kept "outstanding" until all watch events caused by that -+ * request have been delivered. -+ */ -+ if (bdata->hdr.msg.type != XS_WATCH_EVENT) -+ domain_outstanding_inc(conn); - } - - void read_state_buffered_data(const void *ctx, struct connection *conn, -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 8a81fc693f01..db09f463a657 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -56,6 +56,8 @@ struct xs_state_connection; - struct buffered_data - { - struct list_head list; -+ bool on_out_list; -+ bool on_ref_list; - - /* Are we still doing the header? */ - bool inhdr; -@@ -63,6 +65,17 @@ struct buffered_data - /* How far are we? */ - unsigned int used; - -+ /* Outstanding request accounting. */ -+ union { -+ /* ref is being used for requests. */ -+ struct { -+ unsigned int event_cnt; /* # of outstanding events. */ -+ unsigned int domid; /* domid of request. */ -+ } ref; -+ /* req is being used for watch events. */ -+ struct buffered_data *req; /* request causing event. */ -+ } pend; -+ - union { - struct xsd_sockmsg msg; - char raw[sizeof(struct xsd_sockmsg)]; -@@ -123,6 +136,9 @@ struct connection - struct list_head out_list; - uint64_t timeout_msec; - -+ /* Referenced requests no longer pending. */ -+ struct list_head ref_list; -+ - /* Transaction context for current request (NULL if none). 
*/ - struct transaction *transaction; - -@@ -191,7 +207,8 @@ unsigned int get_string(const struct buffered_data *data, unsigned int offset); - - void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - const void *data, unsigned int len); --void send_event(struct connection *conn, const char *path, const char *token); -+void send_event(struct buffered_data *req, struct connection *conn, -+ const char *path, const char *token); - - /* Some routines (write, mkdir, etc) just need a non-error return */ - void send_ack(struct connection *conn, enum xsd_sockmsg_type type); -@@ -247,6 +264,7 @@ extern int dom0_domid; - extern int dom0_event; - extern int priv_domid; - extern int quota_nb_entry_per_domain; -+extern int quota_req_outstanding; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 93c4c1edcdd1..850085a92c76 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -78,6 +78,9 @@ struct domain - /* number of watch for this domain */ - int nbwatch; - -+ /* Number of outstanding requests. 
*/ -+ int nboutstanding; -+ - /* write rate limit */ - wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */ - struct wrl_timestampt wrl_timestamp; -@@ -183,8 +186,12 @@ static bool domain_can_read(struct connection *conn) - { - struct xenstore_domain_interface *intf = conn->domain->interface; - -- if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0) -- return false; -+ if (domain_is_unprivileged(conn)) { -+ if (conn->domain->wrl_credit < 0) -+ return false; -+ if (conn->domain->nboutstanding >= quota_req_outstanding) -+ return false; -+ } - - return (intf->req_cons != intf->req_prod); - } -@@ -331,7 +338,7 @@ static struct domain *alloc_domain(const void *context, unsigned int domid) - { - struct domain *domain; - -- domain = talloc(context, struct domain); -+ domain = talloc_zero(context, struct domain); - if (!domain) { - errno = ENOMEM; - return NULL; -@@ -392,9 +399,6 @@ static int new_domain(struct domain *domain, int port, bool restore) - domain->conn->domain = domain; - domain->conn->id = domain->domid; - -- domain->nbentry = 0; -- domain->nbwatch = 0; -- - return 0; - } - -@@ -938,6 +942,28 @@ int domain_watch(struct connection *conn) - : 0; - } - -+void domain_outstanding_inc(struct connection *conn) -+{ -+ if (!conn || !conn->domain) -+ return; -+ conn->domain->nboutstanding++; -+} -+ -+void domain_outstanding_dec(struct connection *conn) -+{ -+ if (!conn || !conn->domain) -+ return; -+ conn->domain->nboutstanding--; -+} -+ -+void domain_outstanding_domid_dec(unsigned int domid) -+{ -+ struct domain *d = find_domain_by_domid(domid); -+ -+ if (d) -+ d->nboutstanding--; -+} -+ - static wrl_creditt wrl_config_writecost = WRL_FACTOR; - static wrl_creditt wrl_config_rate = WRL_RATE * WRL_FACTOR; - static wrl_creditt wrl_config_dburst = WRL_DBURST * WRL_FACTOR; -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 1e929b8f8c6f..4f51b005291a 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ 
b/tools/xenstore/xenstored_domain.h -@@ -64,6 +64,9 @@ int domain_entry(struct connection *conn); - void domain_watch_inc(struct connection *conn); - void domain_watch_dec(struct connection *conn); - int domain_watch(struct connection *conn); -+void domain_outstanding_inc(struct connection *conn); -+void domain_outstanding_dec(struct connection *conn); -+void domain_outstanding_domid_dec(unsigned int domid); - - /* Special node permission handling. */ - int set_perms_special(struct connection *conn, const char *name, -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 205d9d8ea116..0755ffa375ba 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -142,6 +142,7 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - struct node *node, bool exact, struct node_perms *perms) - { - struct connection *i; -+ struct buffered_data *req; - struct watch *watch; - - /* During transactions, don't fire watches, but queue them. */ -@@ -150,6 +151,8 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - return; - } - -+ req = domain_is_unprivileged(conn) ? conn->in : NULL; -+ - /* Create an event for each watch. */ - list_for_each_entry(i, &connections, list) { - /* introduce/release domain watches */ -@@ -164,12 +167,12 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - list_for_each_entry(watch, &i->watches, list) { - if (exact) { - if (streq(name, watch->node)) -- send_event(i, -+ send_event(req, i, - get_watch_path(watch, name), - watch->token); - } else { - if (is_child(name, watch->node)) -- send_event(i, -+ send_event(req, i, - get_watch_path(watch, name), - watch->token); - } -@@ -269,8 +272,12 @@ int do_watch(struct connection *conn, struct buffered_data *in) - trace_create(watch, "watch"); - send_ack(conn, XS_WATCH); - -- /* We fire once up front: simplifies clients and restart. 
*/ -- send_event(conn, get_watch_path(watch, watch->node), watch->token); -+ /* -+ * We fire once up front: simplifies clients and restart. -+ * This event will not be linked to the XS_WATCH request. -+ */ -+ send_event(NULL, conn, get_watch_path(watch, watch->node), -+ watch->token); - - return 0; - } --- -2.37.4 - diff --git a/0050-x86-nospec-Fix-evaluate_nospec-code-generation-under.patch b/0050-x86-nospec-Fix-evaluate_nospec-code-generation-under.patch new file mode 100644 index 0000000..14a8e14 --- /dev/null +++ b/0050-x86-nospec-Fix-evaluate_nospec-code-generation-under.patch @@ -0,0 +1,101 @@ +From 90320fd05991d7817cea85e1d45674b757abf03c Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 31 Mar 2023 08:39:32 +0200 +Subject: [PATCH 50/61] x86/nospec: Fix evaluate_nospec() code generation under + Clang +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +It turns out that evaluate_nospec() code generation is not safe under Clang. +Given: + + void eval_nospec_test(int x) + { + if ( evaluate_nospec(x) ) + asm volatile ("nop #true" ::: "memory"); + else + asm volatile ("nop #false" ::: "memory"); + } + +Clang emits: + + <eval_nospec_test>: + 0f ae e8 lfence + 85 ff test %edi,%edi + 74 02 je <eval_nospec_test+0x9> + 90 nop + c3 ret + 90 nop + c3 ret + +which is not safe because the lfence has been hoisted above the conditional +jump. Clang concludes that both barrier_nospec_true()'s have identical side +effects and can safely be merged. + +Clang can be persuaded that the side effects are different if there are +different comments in the asm blocks. This is fragile, but no more fragile +that other aspects of this construct. + +Introduce barrier_nospec_false() with a separate internal comment to prevent +Clang merging it with barrier_nospec_true() despite the otherwise-identical +content. 
The generated code now becomes: + + <eval_nospec_test>: + 85 ff test %edi,%edi + 74 05 je <eval_nospec_test+0x9> + 0f ae e8 lfence + 90 nop + c3 ret + 0f ae e8 lfence + 90 nop + c3 ret + +which has the correct number of lfence's, and in the correct place. + +Link: https://github.com/llvm/llvm-project/issues/55084 +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: bc3c133841435829ba5c0a48427e2a77633502ab +master date: 2023-03-24 12:16:31 +0000 +--- + xen/include/asm-x86/nospec.h | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +diff --git a/xen/include/asm-x86/nospec.h b/xen/include/asm-x86/nospec.h +index 5312ae4c6f..7150e76b87 100644 +--- a/xen/include/asm-x86/nospec.h ++++ b/xen/include/asm-x86/nospec.h +@@ -10,15 +10,26 @@ + static always_inline bool barrier_nospec_true(void) + { + #ifdef CONFIG_SPECULATIVE_HARDEN_BRANCH +- alternative("lfence", "", X86_FEATURE_SC_NO_BRANCH_HARDEN); ++ alternative("lfence #nospec-true", "", X86_FEATURE_SC_NO_BRANCH_HARDEN); + #endif + return true; + } + ++static always_inline bool barrier_nospec_false(void) ++{ ++#ifdef CONFIG_SPECULATIVE_HARDEN_BRANCH ++ alternative("lfence #nospec-false", "", X86_FEATURE_SC_NO_BRANCH_HARDEN); ++#endif ++ return false; ++} ++ + /* Allow to protect evaluation of conditionals with respect to speculation */ + static always_inline bool evaluate_nospec(bool condition) + { +- return condition ? 
barrier_nospec_true() : !barrier_nospec_true(); ++ if ( condition ) ++ return barrier_nospec_true(); ++ else ++ return barrier_nospec_false(); + } + + /* Allow to block speculative execution in generic code */ +-- +2.40.0 + diff --git a/0051-tools-xenstore-don-t-buffer-multiple-identical-watch.patch b/0051-tools-xenstore-don-t-buffer-multiple-identical-watch.patch deleted file mode 100644 index 2c2dfd6..0000000 --- a/0051-tools-xenstore-don-t-buffer-multiple-identical-watch.patch +++ /dev/null @@ -1,93 +0,0 @@ -From b270ad4a7ebe3409337bf3730317af6977c38197 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: [PATCH 51/87] tools/xenstore: don't buffer multiple identical watch - events - -A guest not reading its Xenstore response buffer fast enough might -pile up lots of Xenstore watch events buffered. Reduce the generated -load by dropping new events which already have an identical copy -pending. - -The special events "@..." are excluded from that handling as there are -known use cases where the handler is relying on each event to be sent -individually. - -This is part of XSA-326. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit b5c0bdb96d33e18c324c13d8e33c08732d77eaa2) ---- - tools/xenstore/xenstored_core.c | 20 +++++++++++++++++++- - tools/xenstore/xenstored_core.h | 3 +++ - 2 files changed, 22 insertions(+), 1 deletion(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 488d540f3a32..f1fa97b8cf50 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -916,6 +916,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - bdata->inhdr = true; - bdata->used = 0; - bdata->timeout_msec = 0; -+ bdata->watch_event = false; - - if (len <= DEFAULT_BUFFER_SIZE) - bdata->buffer = bdata->default_buffer; -@@ -948,7 +949,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - void send_event(struct buffered_data *req, struct connection *conn, - const char *path, const char *token) - { -- struct buffered_data *bdata; -+ struct buffered_data *bdata, *bd; - unsigned int len; - - len = strlen(path) + 1 + strlen(token) + 1; -@@ -970,12 +971,29 @@ void send_event(struct buffered_data *req, struct connection *conn, - bdata->hdr.msg.type = XS_WATCH_EVENT; - bdata->hdr.msg.len = len; - -+ /* -+ * Check whether an identical event is pending already. -+ * Special events are excluded from that check. 
-+ */ -+ if (path[0] != '@') { -+ list_for_each_entry(bd, &conn->out_list, list) { -+ if (bd->watch_event && bd->hdr.msg.len == len && -+ !memcmp(bdata->buffer, bd->buffer, len)) { -+ trace("dropping duplicate watch %s %s for domain %u\n", -+ path, token, conn->id); -+ talloc_free(bdata); -+ return; -+ } -+ } -+ } -+ - if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { - bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; - if (!conn->timeout_msec) - conn->timeout_msec = bdata->timeout_msec; - } - -+ bdata->watch_event = true; - bdata->pend.req = req; - if (req) - req->pend.ref.event_cnt++; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index db09f463a657..b9b50e81c7b4 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -62,6 +62,9 @@ struct buffered_data - /* Are we still doing the header? */ - bool inhdr; - -+ /* Is this a watch event? */ -+ bool watch_event; -+ - /* How far are we? */ - unsigned int used; - --- -2.37.4 - diff --git a/0051-x86-shadow-Fix-build-with-no-PG_log_dirty.patch b/0051-x86-shadow-Fix-build-with-no-PG_log_dirty.patch new file mode 100644 index 0000000..ef2a137 --- /dev/null +++ b/0051-x86-shadow-Fix-build-with-no-PG_log_dirty.patch @@ -0,0 +1,56 @@ +From 7e1fe95c79d55a1c1a65f71a078b8e31c69ffe94 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 31 Mar 2023 08:39:49 +0200 +Subject: [PATCH 51/61] x86/shadow: Fix build with no PG_log_dirty + +Gitlab Randconfig found: + + arch/x86/mm/shadow/common.c: In function 'shadow_prealloc': + arch/x86/mm/shadow/common.c:1023:18: error: implicit declaration of function + 'paging_logdirty_levels'; did you mean 'paging_log_dirty_init'? 
[-Werror=implicit-function-declaration] + 1023 | count += paging_logdirty_levels(); + | ^~~~~~~~~~~~~~~~~~~~~~ + | paging_log_dirty_init + arch/x86/mm/shadow/common.c:1023:18: error: nested extern declaration of 'paging_logdirty_levels' [-Werror=nested-externs] + +The '#if PG_log_dirty' expression is currently SHADOW_PAGING && !HVM && +PV_SHIM_EXCLUSIVE. Move the declaration outside. + +Fixes: 33fb3a661223 ("x86/shadow: account for log-dirty mode when pre-allocating") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: 6d14cb105b1c54ad7b4228d858ae85aa8a672bbd +master date: 2023-03-24 12:16:31 +0000 +--- + xen/include/asm-x86/paging.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h +index c6b429c691..43abaa5bd1 100644 +--- a/xen/include/asm-x86/paging.h ++++ b/xen/include/asm-x86/paging.h +@@ -154,6 +154,10 @@ struct paging_mode { + /***************************************************************************** + * Log dirty code */ + ++#define paging_logdirty_levels() \ ++ (DIV_ROUND_UP(PADDR_BITS - PAGE_SHIFT - (PAGE_SHIFT + 3), \ ++ PAGE_SHIFT - ilog2(sizeof(mfn_t))) + 1) ++ + #if PG_log_dirty + + /* get the dirty bitmap for a specific range of pfns */ +@@ -192,10 +196,6 @@ int paging_mfn_is_dirty(struct domain *d, mfn_t gmfn); + #define L4_LOGDIRTY_IDX(pfn) ((pfn_x(pfn) >> (PAGE_SHIFT + 3 + PAGETABLE_ORDER * 2)) & \ + (LOGDIRTY_NODE_ENTRIES-1)) + +-#define paging_logdirty_levels() \ +- (DIV_ROUND_UP(PADDR_BITS - PAGE_SHIFT - (PAGE_SHIFT + 3), \ +- PAGE_SHIFT - ilog2(sizeof(mfn_t))) + 1) +- + #ifdef CONFIG_HVM + /* VRAM dirty tracking support */ + struct sh_dirty_vram { +-- +2.40.0 + diff --git a/0052-tools-xenstore-fix-connection-id-usage.patch b/0052-tools-xenstore-fix-connection-id-usage.patch deleted file mode 100644 index 5eac10f..0000000 --- a/0052-tools-xenstore-fix-connection-id-usage.patch +++ 
/dev/null @@ -1,61 +0,0 @@ -From 787241f55216d34ca025c835c6a2096d7664d711 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: [PATCH 52/87] tools/xenstore: fix connection->id usage - -Don't use conn->id for privilege checks, but domain_is_unprivileged(). - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 3047df38e1991510bc295e3e1bb6b6b6c4a97831) ---- - tools/xenstore/xenstored_control.c | 2 +- - tools/xenstore/xenstored_core.h | 2 +- - tools/xenstore/xenstored_transaction.c | 3 ++- - 3 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c -index 7b4300ef7777..adb8d51b043b 100644 ---- a/tools/xenstore/xenstored_control.c -+++ b/tools/xenstore/xenstored_control.c -@@ -891,7 +891,7 @@ int do_control(struct connection *conn, struct buffered_data *in) - unsigned int cmd, num, off; - char **vec = NULL; - -- if (conn->id != 0) -+ if (domain_is_unprivileged(conn)) - return EACCES; - - off = get_string(in, 0); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index b9b50e81c7b4..b1a70488b989 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -123,7 +123,7 @@ struct connection - /* The index of pollfd in global pollfd array */ - int pollfd_idx; - -- /* Who am I? 0 for socket connections. */ -+ /* Who am I? Domid of connection. */ - unsigned int id; - - /* Is this connection ignored? 
*/ -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 54432907fc76..ee1b09031a3b 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -477,7 +477,8 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) - if (conn->transaction) - return EBUSY; - -- if (conn->id && conn->transaction_started > quota_max_transaction) -+ if (domain_is_unprivileged(conn) && -+ conn->transaction_started > quota_max_transaction) - return ENOSPC; - - /* Attach transaction to input for autofree until it's complete */ --- -2.37.4 - diff --git a/0052-x86-vmx-Don-t-spuriously-crash-the-domain-when-INIT-.patch b/0052-x86-vmx-Don-t-spuriously-crash-the-domain-when-INIT-.patch new file mode 100644 index 0000000..c408fbb --- /dev/null +++ b/0052-x86-vmx-Don-t-spuriously-crash-the-domain-when-INIT-.patch @@ -0,0 +1,51 @@ +From b1022b65de59828d40d9d71cc734a42c1c30c972 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 31 Mar 2023 08:40:27 +0200 +Subject: [PATCH 52/61] x86/vmx: Don't spuriously crash the domain when INIT is + received + +In VMX operation, the handling of INIT IPIs is changed. Instead of the CPU +resetting, the next VMEntry fails with EXIT_REASON_INIT. From the TXT spec, +the intent of this behaviour is so that an entity which cares can scrub +secrets from RAM before participating in an orderly shutdown. + +Right now, Xen's behaviour is that when an INIT arrives, the HVM VM which +schedules next is killed (citing an unknown VMExit), *and* we ignore the INIT +and continue blindly onwards anyway. + +This patch addresses only the first of these two problems by ignoring the INIT +and continuing without crashing the VM in question. + +The second wants addressing too, just as soon as we've figured out something +better to do... + +Discovered as collateral damage from when an AP triple faults on S3 resume on +Intel TigerLake platforms. 
+ +Link: https://github.com/QubesOS/qubes-issues/issues/7283 +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Kevin Tian <kevin.tian@intel.com> +master commit: b1f11273d5a774cc88a3685c96c2e7cf6385e3b6 +master date: 2023-03-24 22:49:58 +0000 +--- + xen/arch/x86/hvm/vmx/vmx.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index c8a839cd5e..cebe46ef6a 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -4002,6 +4002,10 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) + case EXIT_REASON_MCE_DURING_VMENTRY: + do_machine_check(regs); + break; ++ ++ case EXIT_REASON_INIT: ++ printk(XENLOG_ERR "Error: INIT received - ignoring\n"); ++ return; /* Re-enter the guest without further processing */ + } + + /* Now enable interrupts so it's safe to take locks. */ +-- +2.40.0 + diff --git a/0053-tools-xenstore-simplify-and-fix-per-domain-node-acco.patch b/0053-tools-xenstore-simplify-and-fix-per-domain-node-acco.patch deleted file mode 100644 index 1bd3051..0000000 --- a/0053-tools-xenstore-simplify-and-fix-per-domain-node-acco.patch +++ /dev/null @@ -1,336 +0,0 @@ -From 717460e062dfe13a69cb01f518dd7b65d39376ef Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: [PATCH 53/87] tools/xenstore: simplify and fix per domain node - accounting - -The accounting of nodes can be simplified now that each connection -holds the associated domid. - -Fix the node accounting to cover nodes created for a domain before it -has been introduced. This requires to react properly to an allocation -failure inside domain_entry_inc() by returning an error code. - -Especially in error paths the node accounting has to be fixed in some -cases. - -This is part of XSA-326 / CVE-2022-42313.
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit dbef1f7482894c572d90cd73d99ed689c891e863) ---- - tools/xenstore/xenstored_core.c | 43 ++++++++-- - tools/xenstore/xenstored_domain.c | 105 ++++++++++++++++--------- - tools/xenstore/xenstored_domain.h | 4 +- - tools/xenstore/xenstored_transaction.c | 8 +- - 4 files changed, 109 insertions(+), 51 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index f1fa97b8cf50..692d863fce35 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -638,7 +638,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - - /* Permissions are struct xs_permissions. */ - node->perms.p = hdr->perms; -- if (domain_adjust_node_perms(node)) { -+ if (domain_adjust_node_perms(conn, node)) { - talloc_free(node); - return NULL; - } -@@ -660,7 +660,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - void *p; - struct xs_tdb_record_hdr *hdr; - -- if (domain_adjust_node_perms(node)) -+ if (domain_adjust_node_perms(conn, node)) - return errno; - - data.dsize = sizeof(*hdr) -@@ -1272,13 +1272,17 @@ nomem: - return NULL; - } - --static int destroy_node(struct connection *conn, struct node *node) -+static void destroy_node_rm(struct node *node) - { - if (streq(node->name, "/")) - corrupt(NULL, "Destroying root node!"); - - tdb_delete(tdb_ctx, node->key); -+} - -+static int destroy_node(struct connection *conn, struct node *node) -+{ -+ destroy_node_rm(node); - domain_entry_dec(conn, node); - - /* -@@ -1328,8 +1332,12 @@ static struct node *create_node(struct connection *conn, const void *ctx, - goto err; - - /* Account for new node */ -- if (i->parent) -- domain_entry_inc(conn, i); -+ if (i->parent) { -+ if (domain_entry_inc(conn, i)) { -+ destroy_node_rm(i); -+ return NULL; -+ } -+ } - } - - return node; -@@ -1614,10 +1622,27 @@ static int 
do_set_perms(struct connection *conn, struct buffered_data *in) - old_perms = node->perms; - domain_entry_dec(conn, node); - node->perms = perms; -- domain_entry_inc(conn, node); -+ if (domain_entry_inc(conn, node)) { -+ node->perms = old_perms; -+ /* -+ * This should never fail because we had a reference on the -+ * domain before and Xenstored is single-threaded. -+ */ -+ domain_entry_inc(conn, node); -+ return ENOMEM; -+ } - -- if (write_node(conn, node, false)) -+ if (write_node(conn, node, false)) { -+ int saved_errno = errno; -+ -+ domain_entry_dec(conn, node); -+ node->perms = old_perms; -+ /* No failure possible as above. */ -+ domain_entry_inc(conn, node); -+ -+ errno = saved_errno; - return errno; -+ } - - fire_watches(conn, in, name, node, false, &old_perms); - send_ack(conn, XS_SET_PERMS); -@@ -3122,7 +3147,9 @@ void read_state_node(const void *ctx, const void *state) - set_tdb_key(name, &key); - if (write_node_raw(NULL, &key, node, true)) - barf("write node error restoring node"); -- domain_entry_inc(&conn, node); -+ -+ if (domain_entry_inc(&conn, node)) -+ barf("node accounting error restoring node"); - - talloc_free(node); - } -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 850085a92c76..260952e09096 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -16,6 +16,7 @@ - along with this program; If not, see <http://www.gnu.org/licenses/>. - */ - -+#include <assert.h> - #include <stdio.h> - #include <sys/mman.h> - #include <unistd.h> -@@ -363,6 +364,18 @@ static struct domain *find_or_alloc_domain(const void *ctx, unsigned int domid) - return domain ? 
: alloc_domain(ctx, domid); - } - -+static struct domain *find_or_alloc_existing_domain(unsigned int domid) -+{ -+ struct domain *domain; -+ xc_dominfo_t dominfo; -+ -+ domain = find_domain_struct(domid); -+ if (!domain && get_domain_info(domid, &dominfo)) -+ domain = alloc_domain(NULL, domid); -+ -+ return domain; -+} -+ - static int new_domain(struct domain *domain, int port, bool restore) - { - int rc; -@@ -782,30 +795,28 @@ void domain_deinit(void) - xenevtchn_unbind(xce_handle, virq_port); - } - --void domain_entry_inc(struct connection *conn, struct node *node) -+int domain_entry_inc(struct connection *conn, struct node *node) - { - struct domain *d; -+ unsigned int domid; - - if (!conn) -- return; -+ return 0; - -- if (node->perms.p && node->perms.p[0].id != conn->id) { -- if (conn->transaction) { -- transaction_entry_inc(conn->transaction, -- node->perms.p[0].id); -- } else { -- d = find_domain_by_domid(node->perms.p[0].id); -- if (d) -- d->nbentry++; -- } -- } else if (conn->domain) { -- if (conn->transaction) { -- transaction_entry_inc(conn->transaction, -- conn->domain->domid); -- } else { -- conn->domain->nbentry++; -- } -+ domid = node->perms.p ? node->perms.p[0].id : conn->id; -+ -+ if (conn->transaction) { -+ transaction_entry_inc(conn->transaction, domid); -+ } else { -+ d = (domid == conn->id && conn->domain) ? conn->domain -+ : find_or_alloc_existing_domain(domid); -+ if (d) -+ d->nbentry++; -+ else -+ return ENOMEM; - } -+ -+ return 0; - } - - /* -@@ -841,7 +852,7 @@ static int chk_domain_generation(unsigned int domid, uint64_t gen) - * Remove permissions for no longer existing domains in order to avoid a new - * domain with the same domid inheriting the permissions. 
- */ --int domain_adjust_node_perms(struct node *node) -+int domain_adjust_node_perms(struct connection *conn, struct node *node) - { - unsigned int i; - int ret; -@@ -851,8 +862,14 @@ int domain_adjust_node_perms(struct node *node) - return errno; - - /* If the owner doesn't exist any longer give it to priv domain. */ -- if (!ret) -+ if (!ret) { -+ /* -+ * In theory we'd need to update the number of dom0 nodes here, -+ * but we could be called for a read of the node. So better -+ * avoid the risk to overflow the node count of dom0. -+ */ - node->perms.p[0].id = priv_domid; -+ } - - for (i = 1; i < node->perms.num; i++) { - if (node->perms.p[i].perms & XS_PERM_IGNORE) -@@ -871,25 +888,25 @@ int domain_adjust_node_perms(struct node *node) - void domain_entry_dec(struct connection *conn, struct node *node) - { - struct domain *d; -+ unsigned int domid; - - if (!conn) - return; - -- if (node->perms.p && node->perms.p[0].id != conn->id) { -- if (conn->transaction) { -- transaction_entry_dec(conn->transaction, -- node->perms.p[0].id); -+ domid = node->perms.p ? node->perms.p[0].id : conn->id; -+ -+ if (conn->transaction) { -+ transaction_entry_dec(conn->transaction, domid); -+ } else { -+ d = (domid == conn->id && conn->domain) ? 
conn->domain -+ : find_domain_struct(domid); -+ if (d) { -+ d->nbentry--; - } else { -- d = find_domain_by_domid(node->perms.p[0].id); -- if (d && d->nbentry) -- d->nbentry--; -- } -- } else if (conn->domain && conn->domain->nbentry) { -- if (conn->transaction) { -- transaction_entry_dec(conn->transaction, -- conn->domain->domid); -- } else { -- conn->domain->nbentry--; -+ errno = ENOENT; -+ corrupt(conn, -+ "Node \"%s\" owned by non-existing domain %u\n", -+ node->name, domid); - } - } - } -@@ -899,13 +916,23 @@ int domain_entry_fix(unsigned int domid, int num, bool update) - struct domain *d; - int cnt; - -- d = find_domain_by_domid(domid); -- if (!d) -- return 0; -+ if (update) { -+ d = find_domain_struct(domid); -+ assert(d); -+ } else { -+ /* -+ * We are called first with update == false in order to catch -+ * any error. So do a possible allocation and check for error -+ * only in this case, as in the case of update == true nothing -+ * can go wrong anymore as the allocation already happened. -+ */ -+ d = find_or_alloc_existing_domain(domid); -+ if (!d) -+ return -1; -+ } - - cnt = d->nbentry + num; -- if (cnt < 0) -- cnt = 0; -+ assert(cnt >= 0); - - if (update) - d->nbentry = cnt; -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 4f51b005291a..d6519904d831 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -54,10 +54,10 @@ const char *get_implicit_path(const struct connection *conn); - bool domain_is_unprivileged(struct connection *conn); - - /* Remove node permissions for no longer existing domains. 
*/ --int domain_adjust_node_perms(struct node *node); -+int domain_adjust_node_perms(struct connection *conn, struct node *node); - - /* Quota manipulation */ --void domain_entry_inc(struct connection *conn, struct node *); -+int domain_entry_inc(struct connection *conn, struct node *); - void domain_entry_dec(struct connection *conn, struct node *); - int domain_entry_fix(unsigned int domid, int num, bool update); - int domain_entry(struct connection *conn); -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index ee1b09031a3b..86caf6c398be 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -519,8 +519,12 @@ static int transaction_fix_domains(struct transaction *trans, bool update) - - list_for_each_entry(d, &trans->changed_domains, list) { - cnt = domain_entry_fix(d->domid, d->nbentry, update); -- if (!update && cnt >= quota_nb_entry_per_domain) -- return ENOSPC; -+ if (!update) { -+ if (cnt >= quota_nb_entry_per_domain) -+ return ENOSPC; -+ if (cnt < 0) -+ return ENOMEM; -+ } - } - - return 0; --- -2.37.4 - diff --git a/0053-x86-ucode-Fix-error-paths-control_thread_fn.patch b/0053-x86-ucode-Fix-error-paths-control_thread_fn.patch new file mode 100644 index 0000000..7bb2c27 --- /dev/null +++ b/0053-x86-ucode-Fix-error-paths-control_thread_fn.patch @@ -0,0 +1,56 @@ +From 0f81c5a2c8e0432d5af3d9f4e6398376cd514516 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 31 Mar 2023 08:40:56 +0200 +Subject: [PATCH 53/61] x86/ucode: Fix error paths control_thread_fn() + +These two early exits skipped re-enabling the watchdog, restoring the NMI +callback, and clearing the nmi_patch global pointer. Always execute the tail +of the function on the way out. 
+ +Fixes: 8dd4dfa92d62 ("x86/microcode: Synchronize late microcode loading") +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Sergey Dyasli <sergey.dyasli@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: fc2e1f3aad602a66c14b8285a1bd38a82f8fd02d +master date: 2023-03-28 11:57:56 +0100 +--- + xen/arch/x86/cpu/microcode/core.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/xen/arch/x86/cpu/microcode/core.c b/xen/arch/x86/cpu/microcode/core.c +index ee7df9a591..ad150e5963 100644 +--- a/xen/arch/x86/cpu/microcode/core.c ++++ b/xen/arch/x86/cpu/microcode/core.c +@@ -488,10 +488,7 @@ static int control_thread_fn(const struct microcode_patch *patch) + ret = wait_for_condition(wait_cpu_callin, num_online_cpus(), + MICROCODE_CALLIN_TIMEOUT_US); + if ( ret ) +- { +- set_state(LOADING_EXIT); +- return ret; +- } ++ goto out; + + /* Control thread loads ucode first while others are in NMI handler. */ + ret = microcode_ops->apply_microcode(patch); +@@ -503,8 +500,7 @@ static int control_thread_fn(const struct microcode_patch *patch) + { + printk(XENLOG_ERR + "Late loading aborted: CPU%u failed to update ucode\n", cpu); +- set_state(LOADING_EXIT); +- return ret; ++ goto out; + } + + /* Let primary threads load the given ucode update */ +@@ -535,6 +531,7 @@ static int control_thread_fn(const struct microcode_patch *patch) + } + } + ++ out: + /* Mark loading is done to unblock other threads */ + set_state(LOADING_EXIT); + +-- +2.40.0 + diff --git a/0054-tools-xenstore-limit-max-number-of-nodes-accessed-in.patch b/0054-tools-xenstore-limit-max-number-of-nodes-accessed-in.patch deleted file mode 100644 index 0a84c6c..0000000 --- a/0054-tools-xenstore-limit-max-number-of-nodes-accessed-in.patch +++ /dev/null @@ -1,255 +0,0 @@ -From 7017cfefc455db535054ebc09124af8101746a4a Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: [PATCH 54/87] 
tools/xenstore: limit max number of nodes accessed in a - transaction - -Today a guest is free to access as many nodes in a single transaction -as it wants. This can lead to unbounded memory consumption in Xenstore -as there is the need to keep track of all nodes having been accessed -during a transaction. - -In oxenstored the number of requests in a transaction is being limited -via a quota maxrequests (default is 1024). As multiple accesses of a -node are not problematic in C Xenstore, limit the number of accessed -nodes. - -In order to let read_node() detect a quota error in case too many nodes -are being accessed, check the return value of access_node() and return -NULL in case an error has been seen. Introduce __must_check and add it -to the access_node() prototype. - -This is part of XSA-326 / CVE-2022-42314. - -Suggested-by: Julien Grall <julien@xen.org> -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 268369d8e322d227a74a899009c5748d7b0ea142) ---- - tools/include/xen-tools/libs.h | 4 +++ - tools/xenstore/xenstored_core.c | 50 ++++++++++++++++++-------- - tools/xenstore/xenstored_core.h | 1 + - tools/xenstore/xenstored_transaction.c | 9 +++++ - tools/xenstore/xenstored_transaction.h | 4 +-- - 5 files changed, 52 insertions(+), 16 deletions(-) - -diff --git a/tools/include/xen-tools/libs.h b/tools/include/xen-tools/libs.h -index a16e0c380709..bafc90e2f603 100644 ---- a/tools/include/xen-tools/libs.h -+++ b/tools/include/xen-tools/libs.h -@@ -63,4 +63,8 @@ - #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1)) - #endif - -+#ifndef __must_check -+#define __must_check __attribute__((__warn_unused_result__)) -+#endif -+ - #endif /* __XEN_TOOLS_LIBS__ */ -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 692d863fce35..f835aa1b2f1f 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -106,6 
+106,7 @@ int quota_nb_watch_per_domain = 128; - int quota_max_entry_size = 2048; /* 2K */ - int quota_max_transaction = 10; - int quota_nb_perms_per_node = 5; -+int quota_trans_nodes = 1024; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; - int quota_req_outstanding = 20; - -@@ -595,6 +596,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - TDB_DATA key, data; - struct xs_tdb_record_hdr *hdr; - struct node *node; -+ int err; - - node = talloc(ctx, struct node); - if (!node) { -@@ -616,14 +618,13 @@ struct node *read_node(struct connection *conn, const void *ctx, - if (data.dptr == NULL) { - if (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) { - node->generation = NO_GENERATION; -- access_node(conn, node, NODE_ACCESS_READ, NULL); -- errno = ENOENT; -+ err = access_node(conn, node, NODE_ACCESS_READ, NULL); -+ errno = err ? : ENOENT; - } else { - log("TDB error on read: %s", tdb_errorstr(tdb_ctx)); - errno = EIO; - } -- talloc_free(node); -- return NULL; -+ goto error; - } - - node->parent = NULL; -@@ -638,19 +639,36 @@ struct node *read_node(struct connection *conn, const void *ctx, - - /* Permissions are struct xs_permissions. */ - node->perms.p = hdr->perms; -- if (domain_adjust_node_perms(conn, node)) { -- talloc_free(node); -- return NULL; -- } -+ if (domain_adjust_node_perms(conn, node)) -+ goto error; - - /* Data is binary blob (usually ascii, no nul). */ - node->data = node->perms.p + hdr->num_perms; - /* Children is strings, nul separated. 
*/ - node->children = node->data + node->datalen; - -- access_node(conn, node, NODE_ACCESS_READ, NULL); -+ if (access_node(conn, node, NODE_ACCESS_READ, NULL)) -+ goto error; - - return node; -+ -+ error: -+ err = errno; -+ talloc_free(node); -+ errno = err; -+ return NULL; -+} -+ -+static bool read_node_can_propagate_errno(void) -+{ -+ /* -+ * 2 error cases for read_node() can always be propagated up: -+ * ENOMEM, because this has nothing to do with the node being in the -+ * data base or not, but is caused by a general lack of memory. -+ * ENOSPC, because this is related to hitting quota limits which need -+ * to be respected. -+ */ -+ return errno == ENOMEM || errno == ENOSPC; - } - - int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, -@@ -767,7 +785,7 @@ static int ask_parents(struct connection *conn, const void *ctx, - node = read_node(conn, ctx, name); - if (node) - break; -- if (errno == ENOMEM) -+ if (read_node_can_propagate_errno()) - return errno; - } while (!streq(name, "/")); - -@@ -829,7 +847,7 @@ static struct node *get_node(struct connection *conn, - } - } - /* Clean up errno if they weren't supposed to know. */ -- if (!node && errno != ENOMEM) -+ if (!node && !read_node_can_propagate_errno()) - errno = errno_from_parents(conn, ctx, name, errno, perm); - return node; - } -@@ -1235,7 +1253,7 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - - /* If parent doesn't exist, create it. */ - parent = read_node(conn, parentname, parentname); -- if (!parent) -+ if (!parent && errno == ENOENT) - parent = construct_node(conn, ctx, parentname); - if (!parent) - return NULL; -@@ -1509,7 +1527,7 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - - parent = read_node(conn, ctx, parentname); - if (!parent) -- return (errno == ENOMEM) ? ENOMEM : EINVAL; -+ return read_node_can_propagate_errno() ? 
errno : EINVAL; - node->parent = parent; - - return delete_node(conn, ctx, parent, node, false); -@@ -1539,7 +1557,7 @@ static int do_rm(struct connection *conn, struct buffered_data *in) - return 0; - } - /* Restore errno, just in case. */ -- if (errno != ENOMEM) -+ if (!read_node_can_propagate_errno()) - errno = ENOENT; - } - return errno; -@@ -2384,6 +2402,8 @@ static void usage(void) - " -M, --path-max <chars> limit the allowed Xenstore node path length,\n" - " -Q, --quota <what>=<nb> set the quota <what> to the value <nb>, allowed\n" - " quotas are:\n" -+" transaction-nodes: number of accessed node per\n" -+" transaction\n" - " outstanding: number of outstanding requests\n" - " -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" - " allowed timeout candidates are:\n" -@@ -2468,6 +2488,8 @@ static void set_quota(const char *arg) - val = get_optval_int(eq + 1); - if (what_matches(arg, "outstanding")) - quota_req_outstanding = val; -+ else if (what_matches(arg, "transaction-nodes")) -+ quota_trans_nodes = val; - else - barf("unknown quota \"%s\"\n", arg); - } -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index b1a70488b989..245f9258235f 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -268,6 +268,7 @@ extern int dom0_event; - extern int priv_domid; - extern int quota_nb_entry_per_domain; - extern int quota_req_outstanding; -+extern int quota_trans_nodes; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 86caf6c398be..7bd41eb475e3 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -156,6 +156,9 @@ struct transaction - /* Connection-local identifier for this transaction. */ - uint32_t id; - -+ /* Node counter. */ -+ unsigned int nodes; -+ - /* Generation when transaction started. 
*/ - uint64_t generation; - -@@ -260,6 +263,11 @@ int access_node(struct connection *conn, struct node *node, - - i = find_accessed_node(trans, node->name); - if (!i) { -+ if (trans->nodes >= quota_trans_nodes && -+ domain_is_unprivileged(conn)) { -+ ret = ENOSPC; -+ goto err; -+ } - i = talloc_zero(trans, struct accessed_node); - if (!i) - goto nomem; -@@ -297,6 +305,7 @@ int access_node(struct connection *conn, struct node *node, - i->ta_node = true; - } - } -+ trans->nodes++; - list_add_tail(&i->list, &trans->accessed); - } - -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 0093cac807e3..e3cbd6b23095 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -39,8 +39,8 @@ void transaction_entry_inc(struct transaction *trans, unsigned int domid); - void transaction_entry_dec(struct transaction *trans, unsigned int domid); - - /* This node was accessed. */ --int access_node(struct connection *conn, struct node *node, -- enum node_access_type type, TDB_DATA *key); -+int __must_check access_node(struct connection *conn, struct node *node, -+ enum node_access_type type, TDB_DATA *key); - - /* Queue watches for a modified node. 
*/ void queue_watches(struct connection *conn, const char *name, bool watch_exact); --- -2.37.4 - diff --git a/0054-vpci-msix-handle-accesses-adjacent-to-the-MSI-X-tabl.patch b/0054-vpci-msix-handle-accesses-adjacent-to-the-MSI-X-tabl.patch new file mode 100644 index 0000000..4973ae7 --- /dev/null +++ b/0054-vpci-msix-handle-accesses-adjacent-to-the-MSI-X-tabl.patch @@ -0,0 +1,543 @@ +From d080287c2a8dce11baee1d7bbf9276757e8572e4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Fri, 31 Mar 2023 08:41:27 +0200 +Subject: [PATCH 54/61] vpci/msix: handle accesses adjacent to the MSI-X table +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The handling of the MSI-X table accesses by Xen requires that any +pages part of the MSI-X related tables are not mapped into the domain +physmap. As a result, any device registers in the same pages as the +start or the end of the MSIX or PBA tables are not currently +accessible, as the accesses are just dropped. + +Note the spec forbids such placing of registers, as the MSIX and PBA +tables must be 4K isolated from any other registers: + +"If a Base Address register that maps address space for the MSI-X +Table or MSI-X PBA also maps other usable address space that is not +associated with MSI-X structures, locations (e.g., for CSRs) used in +the other address space must not share any naturally aligned 4-KB +address range with one where either MSI-X structure resides." + +Yet the 'Intel Wi-Fi 6 AX201' device on one of my boxes has registers +in the same page as the MSIX tables, and thus won't work on a PVH dom0 +without this fix. + +In order to cope with the behavior, pass through any accesses that fall +on the same page as the MSIX tables (but don't fall in between) to the +underlying hardware. Such forwarding also takes care of the PBA +accesses, so it allows to remove the code doing this handling in +msix_{read,write}.
Note that as a result accesses to the PBA array +are no longer limited to 4 and 8 byte sizes, there's no access size +restriction for PBA accesses documented in the specification. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +vpci/msix: restore PBA access length and alignment restrictions + +Accesses to the PBA array have the same length and alignment +limitations as accesses to the MSI-X table: + +"For all accesses to MSI-X Table and MSI-X PBA fields, software must +use aligned full DWORD or aligned full QWORD transactions; otherwise, +the result is undefined." + +Introduce such length and alignment checks into the handling of PBA +accesses for vPCI. This was a mistake of mine for not reading the +specification correctly. + +Note that accesses must now be aligned, and hence there's no longer a +need to check that the end of the access falls into the PBA region as +both the access and the region addresses must be aligned. + +Fixes: b177892d2d ('vpci/msix: handle accesses adjacent to the MSI-X table') +Reported-by: Jan Beulich <jbeulich@suse.com> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +master commit: b177892d2d0e8a31122c218989f43130aeba5282 +master date: 2023-03-28 14:20:35 +0200 +master commit: 7a502b4fbc339e9d3d3d45fb37f09da06bc3081c +master date: 2023-03-29 14:56:33 +0200 +--- + xen/drivers/vpci/msix.c | 357 +++++++++++++++++++++++++++++----------- + xen/drivers/vpci/vpci.c | 7 +- + xen/include/xen/vpci.h | 8 +- + 3 files changed, 275 insertions(+), 97 deletions(-) + +diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c +index ea5d73a02a..7e1bfb2f0a 100644 +--- a/xen/drivers/vpci/msix.c ++++ b/xen/drivers/vpci/msix.c +@@ -27,6 +27,11 @@ + ((addr) >= vmsix_table_addr(vpci, nr) && \ + (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr)) + ++#define VMSIX_ADDR_SAME_PAGE(addr, vpci, nr) \ ++ (PFN_DOWN(addr) >= 
PFN_DOWN(vmsix_table_addr(vpci, nr)) && \ ++ PFN_DOWN(addr) <= PFN_DOWN(vmsix_table_addr(vpci, nr) + \ ++ vmsix_table_size(vpci, nr) - 1)) ++ + static uint32_t control_read(const struct pci_dev *pdev, unsigned int reg, + void *data) + { +@@ -149,7 +154,7 @@ static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr) + + for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ ) + if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled && +- VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) ) ++ VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) ) + return msix; + } + +@@ -182,36 +187,172 @@ static struct vpci_msix_entry *get_entry(struct vpci_msix *msix, + return &msix->entries[(addr - start) / PCI_MSIX_ENTRY_SIZE]; + } + +-static void __iomem *get_pba(struct vpci *vpci) ++static void __iomem *get_table(struct vpci *vpci, unsigned int slot) + { + struct vpci_msix *msix = vpci->msix; ++ paddr_t addr = 0; ++ ++ ASSERT(spin_is_locked(&vpci->lock)); ++ ++ if ( likely(msix->table[slot]) ) ++ return msix->table[slot]; ++ ++ switch ( slot ) ++ { ++ case VPCI_MSIX_TBL_TAIL: ++ addr = vmsix_table_size(vpci, VPCI_MSIX_TABLE); ++ fallthrough; ++ case VPCI_MSIX_TBL_HEAD: ++ addr += vmsix_table_addr(vpci, VPCI_MSIX_TABLE); ++ break; ++ ++ case VPCI_MSIX_PBA_TAIL: ++ addr = vmsix_table_size(vpci, VPCI_MSIX_PBA); ++ fallthrough; ++ case VPCI_MSIX_PBA_HEAD: ++ addr += vmsix_table_addr(vpci, VPCI_MSIX_PBA); ++ break; ++ ++ default: ++ ASSERT_UNREACHABLE(); ++ return NULL; ++ } ++ ++ msix->table[slot] = ioremap(round_pgdown(addr), PAGE_SIZE); ++ ++ return msix->table[slot]; ++} ++ ++unsigned int get_slot(const struct vpci *vpci, unsigned long addr) ++{ ++ unsigned long pfn = PFN_DOWN(addr); ++ + /* +- * PBA will only be unmapped when the device is deassigned, so access it +- * without holding the vpci lock. 
++ * The logic below relies on having the tables identity mapped to the guest ++ * address space, or for the `addr` parameter to be translated into its ++ * host physical memory address equivalent. + */ +- void __iomem *pba = read_atomic(&msix->pba); + +- if ( likely(pba) ) +- return pba; ++ if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE)) ) ++ return VPCI_MSIX_TBL_HEAD; ++ if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_TABLE) + ++ vmsix_table_size(vpci, VPCI_MSIX_TABLE) - 1) ) ++ return VPCI_MSIX_TBL_TAIL; ++ if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA)) ) ++ return VPCI_MSIX_PBA_HEAD; ++ if ( pfn == PFN_DOWN(vmsix_table_addr(vpci, VPCI_MSIX_PBA) + ++ vmsix_table_size(vpci, VPCI_MSIX_PBA) - 1) ) ++ return VPCI_MSIX_PBA_TAIL; ++ ++ ASSERT_UNREACHABLE(); ++ return -1; ++} ++ ++static bool adjacent_handle(const struct vpci_msix *msix, unsigned long addr) ++{ ++ unsigned int i; ++ ++ if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) ) ++ return true; ++ ++ if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_TABLE) ) ++ return false; ++ ++ for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ ) ++ if ( VMSIX_ADDR_SAME_PAGE(addr, msix->pdev->vpci, i) ) ++ return true; ++ ++ return false; ++} ++ ++static int adjacent_read(const struct domain *d, const struct vpci_msix *msix, ++ unsigned long addr, unsigned int len, ++ unsigned long *data) ++{ ++ const void __iomem *mem; ++ struct vpci *vpci = msix->pdev->vpci; ++ unsigned int slot; ++ ++ *data = ~0ul; ++ ++ if ( !adjacent_handle(msix, addr + len - 1) ) ++ return X86EMUL_OKAY; ++ ++ if ( VMSIX_ADDR_IN_RANGE(addr, vpci, VPCI_MSIX_PBA) && ++ !access_allowed(msix->pdev, addr, len) ) ++ /* PBA accesses must be aligned and 4 or 8 bytes in size. 
*/ ++ return X86EMUL_OKAY; ++ ++ slot = get_slot(vpci, addr); ++ if ( slot >= ARRAY_SIZE(msix->table) ) ++ return X86EMUL_OKAY; ++ ++ if ( unlikely(!IS_ALIGNED(addr, len)) ) ++ { ++ unsigned int i; + +- pba = ioremap(vmsix_table_addr(vpci, VPCI_MSIX_PBA), +- vmsix_table_size(vpci, VPCI_MSIX_PBA)); +- if ( !pba ) +- return read_atomic(&msix->pba); ++ gprintk(XENLOG_DEBUG, "%pp: unaligned read to MSI-X related page\n", ++ &msix->pdev->sbdf); ++ ++ /* ++ * Split unaligned accesses into byte sized ones. Shouldn't happen in ++ * the first place, but devices shouldn't have registers in the same 4K ++ * page as the MSIX tables either. ++ * ++ * It's unclear whether this could cause issues if a guest expects ++ * registers to be accessed atomically, it better use an aligned access ++ * if it has such expectations. ++ */ ++ for ( i = 0; i < len; i++ ) ++ { ++ unsigned long partial = ~0ul; ++ int rc = adjacent_read(d, msix, addr + i, 1, &partial); ++ ++ if ( rc != X86EMUL_OKAY ) ++ return rc; ++ ++ *data &= ~(0xfful << (i * 8)); ++ *data |= (partial & 0xff) << (i * 8); ++ } ++ ++ return X86EMUL_OKAY; ++ } + + spin_lock(&vpci->lock); +- if ( !msix->pba ) ++ mem = get_table(vpci, slot); ++ if ( !mem ) + { +- write_atomic(&msix->pba, pba); + spin_unlock(&vpci->lock); ++ gprintk(XENLOG_WARNING, ++ "%pp: unable to map MSI-X page, returning all bits set\n", ++ &msix->pdev->sbdf); ++ return X86EMUL_OKAY; + } +- else ++ ++ switch ( len ) + { +- spin_unlock(&vpci->lock); +- iounmap(pba); ++ case 1: ++ *data = readb(mem + PAGE_OFFSET(addr)); ++ break; ++ ++ case 2: ++ *data = readw(mem + PAGE_OFFSET(addr)); ++ break; ++ ++ case 4: ++ *data = readl(mem + PAGE_OFFSET(addr)); ++ break; ++ ++ case 8: ++ *data = readq(mem + PAGE_OFFSET(addr)); ++ break; ++ ++ default: ++ ASSERT_UNREACHABLE(); + } ++ spin_unlock(&vpci->lock); + +- return read_atomic(&msix->pba); ++ return X86EMUL_OKAY; + } + + static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, +@@ -227,47 +368,11 @@ 
static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, + if ( !msix ) + return X86EMUL_RETRY; + +- if ( !access_allowed(msix->pdev, addr, len) ) +- return X86EMUL_OKAY; +- +- if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) ) +- { +- struct vpci *vpci = msix->pdev->vpci; +- unsigned int idx = addr - vmsix_table_addr(vpci, VPCI_MSIX_PBA); +- const void __iomem *pba = get_pba(vpci); +- +- /* +- * Access to PBA. +- * +- * TODO: note that this relies on having the PBA identity mapped to the +- * guest address space. If this changes the address will need to be +- * translated. +- */ +- if ( !pba ) +- { +- gprintk(XENLOG_WARNING, +- "%pp: unable to map MSI-X PBA, report all pending\n", +- &msix->pdev->sbdf); +- return X86EMUL_OKAY; +- } +- +- switch ( len ) +- { +- case 4: +- *data = readl(pba + idx); +- break; +- +- case 8: +- *data = readq(pba + idx); +- break; +- +- default: +- ASSERT_UNREACHABLE(); +- break; +- } ++ if ( adjacent_handle(msix, addr) ) ++ return adjacent_read(d, msix, addr, len, data); + ++ if ( !access_allowed(msix->pdev, addr, len) ) + return X86EMUL_OKAY; +- } + + spin_lock(&msix->pdev->vpci->lock); + entry = get_entry(msix, addr); +@@ -303,57 +408,103 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len, + return X86EMUL_OKAY; + } + +-static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len, +- unsigned long data) ++static int adjacent_write(const struct domain *d, const struct vpci_msix *msix, ++ unsigned long addr, unsigned int len, ++ unsigned long data) + { +- const struct domain *d = v->domain; +- struct vpci_msix *msix = msix_find(d, addr); +- struct vpci_msix_entry *entry; +- unsigned int offset; ++ void __iomem *mem; ++ struct vpci *vpci = msix->pdev->vpci; ++ unsigned int slot; + +- if ( !msix ) +- return X86EMUL_RETRY; ++ if ( !adjacent_handle(msix, addr + len - 1) ) ++ return X86EMUL_OKAY; + +- if ( !access_allowed(msix->pdev, addr, len) ) ++ /* ++ * Only check 
start and end of the access because the size of the PBA is ++ * assumed to be equal or bigger (8 bytes) than the length of any access ++ * handled here. ++ */ ++ if ( VMSIX_ADDR_IN_RANGE(addr, vpci, VPCI_MSIX_PBA) && ++ (!access_allowed(msix->pdev, addr, len) || !is_hardware_domain(d)) ) ++ /* Ignore writes to PBA for DomUs, it's undefined behavior. */ + return X86EMUL_OKAY; + +- if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) ) +- { +- /* Ignore writes to PBA for DomUs, it's behavior is undefined. */ +- if ( is_hardware_domain(d) ) +- { +- struct vpci *vpci = msix->pdev->vpci; +- unsigned int idx = addr - vmsix_table_addr(vpci, VPCI_MSIX_PBA); +- const void __iomem *pba = get_pba(vpci); ++ slot = get_slot(vpci, addr); ++ if ( slot >= ARRAY_SIZE(msix->table) ) ++ return X86EMUL_OKAY; + +- if ( !pba ) +- { +- /* Unable to map the PBA, ignore write. */ +- gprintk(XENLOG_WARNING, +- "%pp: unable to map MSI-X PBA, write ignored\n", +- &msix->pdev->sbdf); +- return X86EMUL_OKAY; +- } ++ if ( unlikely(!IS_ALIGNED(addr, len)) ) ++ { ++ unsigned int i; + +- switch ( len ) +- { +- case 4: +- writel(data, pba + idx); +- break; ++ gprintk(XENLOG_DEBUG, "%pp: unaligned write to MSI-X related page\n", ++ &msix->pdev->sbdf); + +- case 8: +- writeq(data, pba + idx); +- break; ++ for ( i = 0; i < len; i++ ) ++ { ++ int rc = adjacent_write(d, msix, addr + i, 1, data >> (i * 8)); + +- default: +- ASSERT_UNREACHABLE(); +- break; +- } ++ if ( rc != X86EMUL_OKAY ) ++ return rc; + } + + return X86EMUL_OKAY; + } + ++ spin_lock(&vpci->lock); ++ mem = get_table(vpci, slot); ++ if ( !mem ) ++ { ++ spin_unlock(&vpci->lock); ++ gprintk(XENLOG_WARNING, ++ "%pp: unable to map MSI-X page, dropping write\n", ++ &msix->pdev->sbdf); ++ return X86EMUL_OKAY; ++ } ++ ++ switch ( len ) ++ { ++ case 1: ++ writeb(data, mem + PAGE_OFFSET(addr)); ++ break; ++ ++ case 2: ++ writew(data, mem + PAGE_OFFSET(addr)); ++ break; ++ ++ case 4: ++ writel(data, mem + PAGE_OFFSET(addr)); ++ break; ++ 
++ case 8: ++ writeq(data, mem + PAGE_OFFSET(addr)); ++ break; ++ ++ default: ++ ASSERT_UNREACHABLE(); ++ } ++ spin_unlock(&vpci->lock); ++ ++ return X86EMUL_OKAY; ++} ++ ++static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len, ++ unsigned long data) ++{ ++ const struct domain *d = v->domain; ++ struct vpci_msix *msix = msix_find(d, addr); ++ struct vpci_msix_entry *entry; ++ unsigned int offset; ++ ++ if ( !msix ) ++ return X86EMUL_RETRY; ++ ++ if ( adjacent_handle(msix, addr) ) ++ return adjacent_write(d, msix, addr, len, data); ++ ++ if ( !access_allowed(msix->pdev, addr, len) ) ++ return X86EMUL_OKAY; ++ + spin_lock(&msix->pdev->vpci->lock); + entry = get_entry(msix, addr); + offset = addr & (PCI_MSIX_ENTRY_SIZE - 1); +@@ -482,6 +633,26 @@ int vpci_make_msix_hole(const struct pci_dev *pdev) + } + } + ++ if ( is_hardware_domain(d) ) ++ { ++ /* ++ * For dom0 only: remove any hypervisor mappings of the MSIX or PBA ++ * related areas, as dom0 is capable of moving the position of the BARs ++ * in the host address space. ++ * ++ * We rely on being called with the vPCI lock held once the domain is ++ * running, so the maps are not in use. ++ */ ++ for ( i = 0; i < ARRAY_SIZE(pdev->vpci->msix->table); i++ ) ++ if ( pdev->vpci->msix->table[i] ) ++ { ++ /* If there are any maps, the domain must be running. 
*/ ++ ASSERT(spin_is_locked(&pdev->vpci->lock)); ++ iounmap(pdev->vpci->msix->table[i]); ++ pdev->vpci->msix->table[i] = NULL; ++ } ++ } ++ + return 0; + } + +diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c +index b9339f8f3e..60b5f45cd1 100644 +--- a/xen/drivers/vpci/vpci.c ++++ b/xen/drivers/vpci/vpci.c +@@ -53,9 +53,12 @@ void vpci_remove_device(struct pci_dev *pdev) + spin_unlock(&pdev->vpci->lock); + if ( pdev->vpci->msix ) + { ++ unsigned int i; ++ + list_del(&pdev->vpci->msix->next); +- if ( pdev->vpci->msix->pba ) +- iounmap(pdev->vpci->msix->pba); ++ for ( i = 0; i < ARRAY_SIZE(pdev->vpci->msix->table); i++ ) ++ if ( pdev->vpci->msix->table[i] ) ++ iounmap(pdev->vpci->msix->table[i]); + } + xfree(pdev->vpci->msix); + xfree(pdev->vpci->msi); +diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h +index 755b4fd5c8..3326d9026e 100644 +--- a/xen/include/xen/vpci.h ++++ b/xen/include/xen/vpci.h +@@ -129,8 +129,12 @@ struct vpci { + bool enabled : 1; + /* Masked? */ + bool masked : 1; +- /* PBA map */ +- void __iomem *pba; ++ /* Partial table map. */ ++#define VPCI_MSIX_TBL_HEAD 0 ++#define VPCI_MSIX_TBL_TAIL 1 ++#define VPCI_MSIX_PBA_HEAD 2 ++#define VPCI_MSIX_PBA_TAIL 3 ++ void __iomem *table[4]; + /* Entries. */ + struct vpci_msix_entry { + uint64_t addr; +-- +2.40.0 + diff --git a/0055-ns16550-correct-name-value-pair-parsing-for-PCI-port.patch b/0055-ns16550-correct-name-value-pair-parsing-for-PCI-port.patch new file mode 100644 index 0000000..9c05f3a --- /dev/null +++ b/0055-ns16550-correct-name-value-pair-parsing-for-PCI-port.patch @@ -0,0 +1,59 @@ +From 06264af090ac69a95cdadbc261cc82d964dcb568 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 31 Mar 2023 08:42:02 +0200 +Subject: [PATCH 55/61] ns16550: correct name/value pair parsing for PCI + port/bridge + +First of all these were inverted: "bridge=" caused the port coordinates +to be established, while "port=" controlled the bridge coordinates. 
And +then the error messages being identical also wasn't helpful. While +correcting this also move both case blocks close together. + +Fixes: 97fd49a7e074 ("ns16550: add support for UART parameters to be specifed with name-value pairs") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: e692b22230b411d762ac9e278a398e28df474eae +master date: 2023-03-29 14:55:37 +0200 +--- + xen/drivers/char/ns16550.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/xen/drivers/char/ns16550.c b/xen/drivers/char/ns16550.c +index 5dd4d723f5..3651e0c0d4 100644 +--- a/xen/drivers/char/ns16550.c ++++ b/xen/drivers/char/ns16550.c +@@ -1536,13 +1536,6 @@ static bool __init parse_namevalue_pairs(char *str, struct ns16550 *uart) + break; + + #ifdef CONFIG_HAS_PCI +- case bridge_bdf: +- if ( !parse_pci(param_value, NULL, &uart->ps_bdf[0], +- &uart->ps_bdf[1], &uart->ps_bdf[2]) ) +- PARSE_ERR_RET("Bad port PCI coordinates\n"); +- uart->ps_bdf_enable = true; +- break; +- + case device: + if ( strncmp(param_value, "pci", 3) == 0 ) + { +@@ -1557,9 +1550,16 @@ static bool __init parse_namevalue_pairs(char *str, struct ns16550 *uart) + break; + + case port_bdf: ++ if ( !parse_pci(param_value, NULL, &uart->ps_bdf[0], ++ &uart->ps_bdf[1], &uart->ps_bdf[2]) ) ++ PARSE_ERR_RET("Bad port PCI coordinates\n"); ++ uart->ps_bdf_enable = true; ++ break; ++ ++ case bridge_bdf: + if ( !parse_pci(param_value, NULL, &uart->pb_bdf[0], + &uart->pb_bdf[1], &uart->pb_bdf[2]) ) +- PARSE_ERR_RET("Bad port PCI coordinates\n"); ++ PARSE_ERR_RET("Bad bridge PCI coordinates\n"); + uart->pb_bdf_enable = true; + break; + #endif +-- +2.40.0 + diff --git a/0055-tools-xenstore-move-the-call-of-setup_structure-to-d.patch b/0055-tools-xenstore-move-the-call-of-setup_structure-to-d.patch deleted file mode 100644 index 5a8abbd..0000000 --- a/0055-tools-xenstore-move-the-call-of-setup_structure-to-d.patch +++ /dev/null @@ -1,96 
+0,0 @@ -From 2d39cf77d70b44b70f970da90187f48d2c0b3e96 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: [PATCH 55/87] tools/xenstore: move the call of setup_structure() to - dom0 introduction - -Setting up the basic structure when introducing dom0 has the advantage -to be able to add proper node memory accounting for the added nodes -later. - -This makes it possible to do proper node accounting, too. - -An additional requirement to make that work fine is to correct the -owner of the created nodes to be dom0_domid instead of domid 0. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 60e2f6020dea7f616857b8fc1141b1c085d88761) ---- - tools/xenstore/xenstored_core.c | 9 ++++----- - tools/xenstore/xenstored_core.h | 1 + - tools/xenstore/xenstored_domain.c | 3 +++ - 3 files changed, 8 insertions(+), 5 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index f835aa1b2f1f..5171d34c947e 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2039,7 +2039,8 @@ static int tdb_flags; - static void manual_node(const char *name, const char *child) - { - struct node *node; -- struct xs_permissions perms = { .id = 0, .perms = XS_PERM_NONE }; -+ struct xs_permissions perms = { .id = dom0_domid, -+ .perms = XS_PERM_NONE }; - - node = talloc_zero(NULL, struct node); - if (!node) -@@ -2078,7 +2079,7 @@ static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...) 
- } - } - --static void setup_structure(bool live_update) -+void setup_structure(bool live_update) - { - char *tdbname; - -@@ -2101,6 +2102,7 @@ static void setup_structure(bool live_update) - manual_node("/", "tool"); - manual_node("/tool", "xenstored"); - manual_node("/tool/xenstored", NULL); -+ domain_entry_fix(dom0_domid, 3, true); - } - - check_store(); -@@ -2614,9 +2616,6 @@ int main(int argc, char *argv[]) - - init_pipe(reopen_log_pipe); - -- /* Setup the database */ -- setup_structure(live_update); -- - /* Listen to hypervisor. */ - if (!no_domain_init && !live_update) { - domain_init(-1); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 245f9258235f..2c77ec7ee0f4 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -231,6 +231,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - struct node *read_node(struct connection *conn, const void *ctx, - const char *name); - -+void setup_structure(bool live_update); - struct connection *new_connection(const struct interface_funcs *funcs); - struct connection *get_connection_by_id(unsigned int conn_id); - void ignore_connection(struct connection *conn); -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 260952e09096..f04b7aae8a32 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -470,6 +470,9 @@ static struct domain *introduce_domain(const void *ctx, - } - domain->interface = interface; - -+ if (is_master_domain) -+ setup_structure(restore); -+ - /* Now domain belongs to its connection. 
*/ - talloc_steal(domain->conn, domain); - --- -2.37.4 - diff --git a/0056-bump-default-SeaBIOS-version-to-1.16.0.patch b/0056-bump-default-SeaBIOS-version-to-1.16.0.patch new file mode 100644 index 0000000..37d9b67 --- /dev/null +++ b/0056-bump-default-SeaBIOS-version-to-1.16.0.patch @@ -0,0 +1,28 @@ +From 2a4d327387601b60c9844a5b0cc44de28792ea52 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Fri, 6 May 2022 14:46:52 +0200 +Subject: [PATCH 56/61] bump default SeaBIOS version to 1.16.0 + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> +(cherry picked from commit 944e389daa133dd310d87c4eebacba9f6da76018) +--- + Config.mk | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/Config.mk b/Config.mk +index 1215c2725b..073715c28d 100644 +--- a/Config.mk ++++ b/Config.mk +@@ -241,7 +241,7 @@ OVMF_UPSTREAM_REVISION ?= 7b4a99be8a39c12d3a7fc4b8db9f0eab4ac688d5 + QEMU_UPSTREAM_REVISION ?= qemu-xen-4.16.3 + MINIOS_UPSTREAM_REVISION ?= xen-RELEASE-4.16.3 + +-SEABIOS_UPSTREAM_REVISION ?= rel-1.14.0 ++SEABIOS_UPSTREAM_REVISION ?= rel-1.16.0 + + ETHERBOOT_NICS ?= rtl8139 8086100e + +-- +2.40.0 + diff --git a/0056-tools-xenstore-add-infrastructure-to-keep-track-of-p.patch b/0056-tools-xenstore-add-infrastructure-to-keep-track-of-p.patch deleted file mode 100644 index b92c61c..0000000 --- a/0056-tools-xenstore-add-infrastructure-to-keep-track-of-p.patch +++ /dev/null @@ -1,289 +0,0 @@ -From 2e406cf5fbb817341dc860473158382057e13de5 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: [PATCH 56/87] tools/xenstore: add infrastructure to keep track of per - domain memory usage - -The amount of memory a domain can consume in Xenstore is limited by -various quota today, but even with sane quota a domain can still -consume rather large memory quantities. 
- -Add the infrastructure for keeping track of the amount of memory a -domain is consuming in Xenstore. Note that this is only the memory a -domain has direct control over, so any internal administration data -needed by Xenstore only is not being accounted for. - -There are two quotas defined: a soft quota which will result in a -warning issued via syslog() when it is exceeded, and a hard quota -resulting in a stop of accepting further requests or watch events as -long as the hard quota would be violated by accepting those. - -Setting any of those quotas to 0 will disable it. - -As default values use 2MB per domain for the soft limit (this basically -covers the allowed case to create 1000 nodes needing 2kB each), and -2.5MB for the hard limit. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 0d4a8ec7a93faedbe54fd197db146de628459e77) ---- - tools/xenstore/xenstored_core.c | 30 ++++++++-- - tools/xenstore/xenstored_core.h | 2 + - tools/xenstore/xenstored_domain.c | 93 +++++++++++++++++++++++++++++++ - tools/xenstore/xenstored_domain.h | 20 +++++++ - 4 files changed, 139 insertions(+), 6 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 5171d34c947e..b2bf6740d430 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -109,6 +109,8 @@ int quota_nb_perms_per_node = 5; - int quota_trans_nodes = 1024; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; - int quota_req_outstanding = 20; -+int quota_memory_per_domain_soft = 2 * 1024 * 1024; /* 2 MB */ -+int quota_memory_per_domain_hard = 2 * 1024 * 1024 + 512 * 1024; /* 2.5 MB */ - - unsigned int timeout_watch_event_msec = 20000; - -@@ -2406,7 +2408,14 @@ static void usage(void) - " quotas are:\n" - " transaction-nodes: number of accessed node per\n" - " transaction\n" -+" memory: total used memory per domain for nodes,\n" -+" 
transactions, watches and requests, above\n" -+" which Xenstore will stop talking to domain\n" - " outstanding: number of outstanding requests\n" -+" -q, --quota-soft <what>=<nb> set a soft quota <what> to the value <nb>,\n" -+" causing a warning to be issued via syslog() if the\n" -+" limit is violated, allowed quotas are:\n" -+" memory: see above\n" - " -w, --timeout <what>=<seconds> set the timeout in seconds for <what>,\n" - " allowed timeout candidates are:\n" - " watch-event: time a watch-event is kept pending\n" -@@ -2433,6 +2442,7 @@ static struct option options[] = { - { "perm-nb", 1, NULL, 'A' }, - { "path-max", 1, NULL, 'M' }, - { "quota", 1, NULL, 'Q' }, -+ { "quota-soft", 1, NULL, 'q' }, - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, -@@ -2480,7 +2490,7 @@ static void set_timeout(const char *arg) - barf("unknown timeout \"%s\"\n", arg); - } - --static void set_quota(const char *arg) -+static void set_quota(const char *arg, bool soft) - { - const char *eq = strchr(arg, '='); - int val; -@@ -2488,11 +2498,16 @@ static void set_quota(const char *arg) - if (!eq) - barf("quotas must be specified via <what>=<nb>\n"); - val = get_optval_int(eq + 1); -- if (what_matches(arg, "outstanding")) -+ if (what_matches(arg, "outstanding") && !soft) - quota_req_outstanding = val; -- else if (what_matches(arg, "transaction-nodes")) -+ else if (what_matches(arg, "transaction-nodes") && !soft) - quota_trans_nodes = val; -- else -+ else if (what_matches(arg, "memory")) { -+ if (soft) -+ quota_memory_per_domain_soft = val; -+ else -+ quota_memory_per_domain_hard = val; -+ } else - barf("unknown quota \"%s\"\n", arg); - } - -@@ -2510,7 +2525,7 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:T:RVW:w:U", -+ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:q:T:RVW:w:U", - options, NULL)) != -1) { - switch (opt) { - case 'D': -@@ 
-2561,7 +2576,10 @@ int main(int argc, char *argv[]) - quota_max_path_len); - break; - case 'Q': -- set_quota(optarg); -+ set_quota(optarg, false); -+ break; -+ case 'q': -+ set_quota(optarg, true); - break; - case 'w': - set_timeout(optarg); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 2c77ec7ee0f4..373af18297bf 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -270,6 +270,8 @@ extern int priv_domid; - extern int quota_nb_entry_per_domain; - extern int quota_req_outstanding; - extern int quota_trans_nodes; -+extern int quota_memory_per_domain_soft; -+extern int quota_memory_per_domain_hard; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index f04b7aae8a32..94fd561e9de4 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -76,6 +76,13 @@ struct domain - /* number of entry from this domain in the store */ - int nbentry; - -+ /* Amount of memory allocated for this domain. 
*/ -+ int memory; -+ bool soft_quota_reported; -+ bool hard_quota_reported; -+ time_t mem_last_msg; -+#define MEM_WARN_MINTIME_SEC 10 -+ - /* number of watch for this domain */ - int nbwatch; - -@@ -192,6 +199,9 @@ static bool domain_can_read(struct connection *conn) - return false; - if (conn->domain->nboutstanding >= quota_req_outstanding) - return false; -+ if (conn->domain->memory >= quota_memory_per_domain_hard && -+ quota_memory_per_domain_hard) -+ return false; - } - - return (intf->req_cons != intf->req_prod); -@@ -950,6 +960,89 @@ int domain_entry(struct connection *conn) - : 0; - } - -+static bool domain_chk_quota(struct domain *domain, int mem) -+{ -+ time_t now; -+ -+ if (!domain || !domid_is_unprivileged(domain->domid) || -+ (domain->conn && domain->conn->is_ignored)) -+ return false; -+ -+ now = time(NULL); -+ -+ if (mem >= quota_memory_per_domain_hard && -+ quota_memory_per_domain_hard) { -+ if (domain->hard_quota_reported) -+ return true; -+ syslog(LOG_ERR, "Domain %u exceeds hard memory quota, Xenstore interface to domain stalled\n", -+ domain->domid); -+ domain->mem_last_msg = now; -+ domain->hard_quota_reported = true; -+ return true; -+ } -+ -+ if (now - domain->mem_last_msg >= MEM_WARN_MINTIME_SEC) { -+ if (domain->hard_quota_reported) { -+ domain->mem_last_msg = now; -+ domain->hard_quota_reported = false; -+ syslog(LOG_INFO, "Domain %u below hard memory quota again\n", -+ domain->domid); -+ } -+ if (mem >= quota_memory_per_domain_soft && -+ quota_memory_per_domain_soft && -+ !domain->soft_quota_reported) { -+ domain->mem_last_msg = now; -+ domain->soft_quota_reported = true; -+ syslog(LOG_WARNING, "Domain %u exceeds soft memory quota\n", -+ domain->domid); -+ } -+ if (mem < quota_memory_per_domain_soft && -+ domain->soft_quota_reported) { -+ domain->mem_last_msg = now; -+ domain->soft_quota_reported = false; -+ syslog(LOG_INFO, "Domain %u below soft memory quota again\n", -+ domain->domid); -+ } -+ -+ } -+ -+ return false; -+} -+ -+int 
domain_memory_add(unsigned int domid, int mem, bool no_quota_check) -+{ -+ struct domain *domain; -+ -+ domain = find_domain_struct(domid); -+ if (domain) { -+ /* -+ * domain_chk_quota() will print warning and also store whether -+ * the soft/hard quota has been hit. So check no_quota_check -+ * *after*. -+ */ -+ if (domain_chk_quota(domain, domain->memory + mem) && -+ !no_quota_check) -+ return ENOMEM; -+ domain->memory += mem; -+ } else { -+ /* -+ * The domain the memory is to be accounted for should always -+ * exist, as accounting is done either for a domain related to -+ * the current connection, or for the domain owning a node -+ * (which is always existing, as the owner of the node is -+ * tested to exist and replaced by domid 0 if not). -+ * So not finding the related domain MUST be an error in the -+ * data base. -+ */ -+ errno = ENOENT; -+ corrupt(NULL, "Accounting called for non-existing domain %u\n", -+ domid); -+ return ENOENT; -+ } -+ -+ return 0; -+} -+ - void domain_watch_inc(struct connection *conn) - { - if (!conn || !conn->domain) -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index d6519904d831..633c9a0a0a1f 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -61,6 +61,26 @@ int domain_entry_inc(struct connection *conn, struct node *); - void domain_entry_dec(struct connection *conn, struct node *); - int domain_entry_fix(unsigned int domid, int num, bool update); - int domain_entry(struct connection *conn); -+int domain_memory_add(unsigned int domid, int mem, bool no_quota_check); -+ -+/* -+ * domain_memory_add_chk(): to be used when memory quota should be checked. -+ * Not to be used when specifying a negative mem value, as lowering the used -+ * memory should always be allowed. 
-+ */ -+static inline int domain_memory_add_chk(unsigned int domid, int mem) -+{ -+ return domain_memory_add(domid, mem, false); -+} -+/* -+ * domain_memory_add_nochk(): to be used when memory quota should not be -+ * checked, e.g. when lowering memory usage, or in an error case for undoing -+ * a previous memory adjustment. -+ */ -+static inline void domain_memory_add_nochk(unsigned int domid, int mem) -+{ -+ domain_memory_add(domid, mem, true); -+} - void domain_watch_inc(struct connection *conn); - void domain_watch_dec(struct connection *conn); - int domain_watch(struct connection *conn); --- -2.37.4 - diff --git a/0057-CI-Drop-automation-configs.patch b/0057-CI-Drop-automation-configs.patch new file mode 100644 index 0000000..d726468 --- /dev/null +++ b/0057-CI-Drop-automation-configs.patch @@ -0,0 +1,87 @@ +From 657dc5f5f6269008fd7484ca7cca723e21455483 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Thu, 29 Dec 2022 15:39:13 +0000 +Subject: [PATCH 57/61] CI: Drop automation/configs/ + +Having 3 extra hypervisor builds on the end of a full build is deeply +confusing to debug if one of them fails, because the .config file presented in +the artefacts is not the one which caused a build failure. Also, the log +tends to be truncated in the UI. + +PV-only is tested as part of PV-Shim in a full build anyway, so doesn't need +repeating. HVM-only and neither appear frequently in randconfig, so drop all +the logic here to simplify things. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Michal Orzel <michal.orzel@amd.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +(cherry picked from commit 7b20009a812f26e74bdbde2ab96165376b3dad34) +--- + automation/configs/x86/hvm_only_config | 3 --- + automation/configs/x86/no_hvm_pv_config | 3 --- + automation/configs/x86/pv_only_config | 3 --- + automation/scripts/build | 21 --------------------- + 4 files changed, 30 deletions(-) + delete mode 100644 automation/configs/x86/hvm_only_config + delete mode 100644 automation/configs/x86/no_hvm_pv_config + delete mode 100644 automation/configs/x86/pv_only_config + +diff --git a/automation/configs/x86/hvm_only_config b/automation/configs/x86/hvm_only_config +deleted file mode 100644 +index 9efbddd535..0000000000 +--- a/automation/configs/x86/hvm_only_config ++++ /dev/null +@@ -1,3 +0,0 @@ +-CONFIG_HVM=y +-# CONFIG_PV is not set +-# CONFIG_DEBUG is not set +diff --git a/automation/configs/x86/no_hvm_pv_config b/automation/configs/x86/no_hvm_pv_config +deleted file mode 100644 +index 0bf6a8e468..0000000000 +--- a/automation/configs/x86/no_hvm_pv_config ++++ /dev/null +@@ -1,3 +0,0 @@ +-# CONFIG_HVM is not set +-# CONFIG_PV is not set +-# CONFIG_DEBUG is not set +diff --git a/automation/configs/x86/pv_only_config b/automation/configs/x86/pv_only_config +deleted file mode 100644 +index e9d8b4a7c7..0000000000 +--- a/automation/configs/x86/pv_only_config ++++ /dev/null +@@ -1,3 +0,0 @@ +-CONFIG_PV=y +-# CONFIG_HVM is not set +-# CONFIG_DEBUG is not set +diff --git a/automation/scripts/build b/automation/scripts/build +index 281f8b1fcc..2c807fa397 100755 +--- a/automation/scripts/build ++++ b/automation/scripts/build +@@ -73,24 +73,3 @@ if [[ "${XEN_TARGET_ARCH}" != "x86_32" ]]; then + cp -r dist binaries/ + fi + fi +- +-if [[ "${hypervisor_only}" == "y" ]]; then +- # If we are build testing a specific Kconfig exit now, there's no point in +- # testing all the possible configs. 
+- exit 0 +-fi +- +-# Build all the configs we care about +-case ${XEN_TARGET_ARCH} in +- x86_64) arch=x86 ;; +- *) exit 0 ;; +-esac +- +-cfg_dir="automation/configs/${arch}" +-for cfg in `ls ${cfg_dir}`; do +- echo "Building $cfg" +- make -j$(nproc) -C xen clean +- rm -f xen/.config +- make -C xen KBUILD_DEFCONFIG=../../../../${cfg_dir}/${cfg} XEN_CONFIG_EXPERT=y defconfig +- make -j$(nproc) -C xen XEN_CONFIG_EXPERT=y +-done +-- +2.40.0 + diff --git a/0057-tools-xenstore-add-memory-accounting-for-responses.patch b/0057-tools-xenstore-add-memory-accounting-for-responses.patch deleted file mode 100644 index 9dd565d..0000000 --- a/0057-tools-xenstore-add-memory-accounting-for-responses.patch +++ /dev/null @@ -1,82 +0,0 @@ -From 30c8e752f66f681b5c731a637c26510ae5f35965 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: [PATCH 57/87] tools/xenstore: add memory accounting for responses - -Add the memory accounting for queued responses. - -In case adding a watch event for a guest is causing the hard memory -quota of that guest to be violated, the event is dropped. This will -ensure that it is impossible to drive another guest past its memory -quota by generating insane amounts of events for that guest. This is -especially important for protecting driver domains from that attack -vector. - -This is part of XSA-326 / CVE-2022-42315. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit f6d00133643a524d2138c9e3f192bbde719050ba) ---- - tools/xenstore/xenstored_core.c | 22 +++++++++++++++++++--- - 1 file changed, 19 insertions(+), 3 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index b2bf6740d430..ecab6cfbbe15 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -260,6 +260,8 @@ static void free_buffered_data(struct buffered_data *out, - } - } - -+ domain_memory_add_nochk(conn->id, -out->hdr.msg.len - sizeof(out->hdr)); -+ - if (out->hdr.msg.type == XS_WATCH_EVENT) { - req = out->pend.req; - if (req) { -@@ -938,11 +940,14 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - bdata->timeout_msec = 0; - bdata->watch_event = false; - -- if (len <= DEFAULT_BUFFER_SIZE) -+ if (len <= DEFAULT_BUFFER_SIZE) { - bdata->buffer = bdata->default_buffer; -- else { -+ /* Don't check quota, path might be used for returning error. */ -+ domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr)); -+ } else { - bdata->buffer = talloc_array(bdata, char, len); -- if (!bdata->buffer) { -+ if (!bdata->buffer || -+ domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { - send_error(conn, ENOMEM); - return; - } -@@ -1007,6 +1012,11 @@ void send_event(struct buffered_data *req, struct connection *conn, - } - } - -+ if (domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { -+ talloc_free(bdata); -+ return; -+ } -+ - if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { - bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; - if (!conn->timeout_msec) -@@ -3039,6 +3049,12 @@ static void add_buffered_data(struct buffered_data *bdata, - */ - if (bdata->hdr.msg.type != XS_WATCH_EVENT) - domain_outstanding_inc(conn); -+ /* -+ * We are restoring the state after Live-Update and the new quota may -+ * be smaller. 
So ignore it. The limit will be applied for any resource -+ * after the state has been fully restored. -+ */ -+ domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr)); - } - - void read_state_buffered_data(const void *ctx, struct connection *conn, --- -2.37.4 - diff --git a/0058-automation-Switch-arm32-cross-builds-to-run-on-arm64.patch b/0058-automation-Switch-arm32-cross-builds-to-run-on-arm64.patch new file mode 100644 index 0000000..92d65ec --- /dev/null +++ b/0058-automation-Switch-arm32-cross-builds-to-run-on-arm64.patch @@ -0,0 +1,87 @@ +From 37800cf8ab7806e506b96a13cad0fb395d86663a Mon Sep 17 00:00:00 2001 +From: Michal Orzel <michal.orzel@amd.com> +Date: Tue, 14 Feb 2023 16:38:38 +0100 +Subject: [PATCH 58/61] automation: Switch arm32 cross builds to run on arm64 + +Due to the limited x86 CI resources slowing down the whole pipeline, +switch the arm32 cross builds to be executed on arm64 which is much more +capable. For that, rename the existing debian container dockerfile +from unstable-arm32-gcc to unstable-arm64v8-arm32-gcc and use +arm64v8/debian:unstable as an image. Note, that we cannot use the same +container name as we have to keep the backwards compatibility. +Take the opportunity to remove extra empty line at the end of a file. + +Modify the tag of .arm32-cross-build-tmpl to arm64 and update the build +jobs accordingly. 
+ +Signed-off-by: Michal Orzel <michal.orzel@amd.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +(cherry picked from commit a35fccc8df93de7154dba87db6e7bcf391e9d51c) +--- + ...ockerfile => unstable-arm64v8-arm32-gcc.dockerfile} | 3 +-- + automation/gitlab-ci/build.yaml | 10 +++++----- + 2 files changed, 6 insertions(+), 7 deletions(-) + rename automation/build/debian/{unstable-arm32-gcc.dockerfile => unstable-arm64v8-arm32-gcc.dockerfile} (94%) + +diff --git a/automation/build/debian/unstable-arm32-gcc.dockerfile b/automation/build/debian/unstable-arm64v8-arm32-gcc.dockerfile +similarity index 94% +rename from automation/build/debian/unstable-arm32-gcc.dockerfile +rename to automation/build/debian/unstable-arm64v8-arm32-gcc.dockerfile +index b41a57f197..11860425a6 100644 +--- a/automation/build/debian/unstable-arm32-gcc.dockerfile ++++ b/automation/build/debian/unstable-arm64v8-arm32-gcc.dockerfile +@@ -1,4 +1,4 @@ +-FROM debian:unstable ++FROM arm64v8/debian:unstable + LABEL maintainer.name="The Xen Project" \ + maintainer.email="xen-devel@lists.xenproject.org" + +@@ -21,4 +21,3 @@ RUN apt-get update && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* /tmp/* /var/tmp/* +- +diff --git a/automation/gitlab-ci/build.yaml b/automation/gitlab-ci/build.yaml +index 06a75a8c5a..f66fbca8a7 100644 +--- a/automation/gitlab-ci/build.yaml ++++ b/automation/gitlab-ci/build.yaml +@@ -123,7 +123,7 @@ + variables: + XEN_TARGET_ARCH: arm32 + tags: +- - x86_64 ++ - arm64 + + .arm32-cross-build: + extends: .arm32-cross-build-tmpl +@@ -497,23 +497,23 @@ alpine-3.12-clang-debug: + debian-unstable-gcc-arm32: + extends: .gcc-arm32-cross-build + variables: +- CONTAINER: debian:unstable-arm32-gcc ++ CONTAINER: debian:unstable-arm64v8-arm32-gcc + + debian-unstable-gcc-arm32-debug: + extends: .gcc-arm32-cross-build-debug + variables: +- CONTAINER: debian:unstable-arm32-gcc ++ CONTAINER: debian:unstable-arm64v8-arm32-gcc + + 
debian-unstable-gcc-arm32-randconfig: + extends: .gcc-arm32-cross-build + variables: +- CONTAINER: debian:unstable-arm32-gcc ++ CONTAINER: debian:unstable-arm64v8-arm32-gcc + RANDCONFIG: y + + debian-unstable-gcc-arm32-debug-randconfig: + extends: .gcc-arm32-cross-build-debug + variables: +- CONTAINER: debian:unstable-arm32-gcc ++ CONTAINER: debian:unstable-arm64v8-arm32-gcc + RANDCONFIG: y + + # Arm builds +-- +2.40.0 + diff --git a/0058-tools-xenstore-add-memory-accounting-for-watches.patch b/0058-tools-xenstore-add-memory-accounting-for-watches.patch deleted file mode 100644 index dc6b80c..0000000 --- a/0058-tools-xenstore-add-memory-accounting-for-watches.patch +++ /dev/null @@ -1,96 +0,0 @@ -From bce985745cde48a339954759677b77d3eeec41f3 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 58/87] tools/xenstore: add memory accounting for watches - -Add the memory accounting for registered watches. - -When a socket connection is destroyed, the associated watches are -removed, too. In order to keep memory accounting correct the watches -must be removed explicitly via a call of conn_delete_all_watches() from -destroy_conn(). - -This is part of XSA-326 / CVE-2022-42315. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 7f9978a2cc37aaffab2fb09593bc598c0712a69b) ---- - tools/xenstore/xenstored_core.c | 1 + - tools/xenstore/xenstored_watch.c | 13 ++++++++++--- - 2 files changed, 11 insertions(+), 3 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index ecab6cfbbe15..d86942f5aa77 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -463,6 +463,7 @@ static int destroy_conn(void *_conn) - } - - conn_free_buffered_data(conn); -+ conn_delete_all_watches(conn); - list_for_each_entry(req, &conn->ref_list, list) - req->on_ref_list = false; - -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 0755ffa375ba..fdf9b2d653a0 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -211,7 +211,7 @@ static int check_watch_path(struct connection *conn, const void *ctx, - } - - static struct watch *add_watch(struct connection *conn, char *path, char *token, -- bool relative) -+ bool relative, bool no_quota_check) - { - struct watch *watch; - -@@ -222,6 +222,9 @@ static struct watch *add_watch(struct connection *conn, char *path, char *token, - watch->token = talloc_strdup(watch, token); - if (!watch->node || !watch->token) - goto nomem; -+ if (domain_memory_add(conn->id, strlen(path) + strlen(token), -+ no_quota_check)) -+ goto nomem; - - if (relative) - watch->relative_path = get_implicit_path(conn); -@@ -265,7 +268,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) - if (domain_watch(conn) > quota_nb_watch_per_domain) - return E2BIG; - -- watch = add_watch(conn, vec[0], vec[1], relative); -+ watch = add_watch(conn, vec[0], vec[1], relative, false); - if (!watch) - return errno; - -@@ -296,6 +299,8 @@ int do_unwatch(struct connection *conn, struct buffered_data *in) - list_for_each_entry(watch, 
&conn->watches, list) { - if (streq(watch->node, node) && streq(watch->token, vec[1])) { - list_del(&watch->list); -+ domain_memory_add_nochk(conn->id, -strlen(watch->node) - -+ strlen(watch->token)); - talloc_free(watch); - domain_watch_dec(conn); - send_ack(conn, XS_UNWATCH); -@@ -311,6 +316,8 @@ void conn_delete_all_watches(struct connection *conn) - - while ((watch = list_top(&conn->watches, struct watch, list))) { - list_del(&watch->list); -+ domain_memory_add_nochk(conn->id, -strlen(watch->node) - -+ strlen(watch->token)); - talloc_free(watch); - domain_watch_dec(conn); - } -@@ -373,7 +380,7 @@ void read_state_watch(const void *ctx, const void *state) - if (!path) - barf("allocation error for read watch"); - -- if (!add_watch(conn, path, token, relative)) -+ if (!add_watch(conn, path, token, relative, true)) - barf("error adding watch"); - } - --- -2.37.4 - diff --git a/0059-automation-Remove-CentOS-7.2-containers-and-builds.patch b/0059-automation-Remove-CentOS-7.2-containers-and-builds.patch new file mode 100644 index 0000000..8d58eea --- /dev/null +++ b/0059-automation-Remove-CentOS-7.2-containers-and-builds.patch @@ -0,0 +1,145 @@ +From a4d901580b2ab3133bca13159b790914c217b0e2 Mon Sep 17 00:00:00 2001 +From: Anthony PERARD <anthony.perard@citrix.com> +Date: Tue, 21 Feb 2023 16:55:36 +0000 +Subject: [PATCH 59/61] automation: Remove CentOS 7.2 containers and builds + +We already have a container which track the latest CentOS 7, no need +for this one as well. + +Also, 7.2 have outdated root certificate which prevent connection to +website which use Let's Encrypt. 
+ +Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +(cherry picked from commit ba512629f76dfddb39ea9133ee51cdd9e392a927) +--- + automation/build/centos/7.2.dockerfile | 52 ------------------------- + automation/build/centos/CentOS-7.2.repo | 35 ----------------- + automation/gitlab-ci/build.yaml | 10 ----- + 3 files changed, 97 deletions(-) + delete mode 100644 automation/build/centos/7.2.dockerfile + delete mode 100644 automation/build/centos/CentOS-7.2.repo + +diff --git a/automation/build/centos/7.2.dockerfile b/automation/build/centos/7.2.dockerfile +deleted file mode 100644 +index 4baa097e31..0000000000 +--- a/automation/build/centos/7.2.dockerfile ++++ /dev/null +@@ -1,52 +0,0 @@ +-FROM centos:7.2.1511 +-LABEL maintainer.name="The Xen Project" \ +- maintainer.email="xen-devel@lists.xenproject.org" +- +-# ensure we only get bits from the vault for +-# the version we want +-COPY CentOS-7.2.repo /etc/yum.repos.d/CentOS-Base.repo +- +-# install EPEL for dev86, xz-devel and possibly other packages +-RUN yum -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \ +- yum clean all +- +-RUN mkdir /build +-WORKDIR /build +- +-# work around https://github.com/moby/moby/issues/10180 +-# and install Xen depends +-RUN rpm --rebuilddb && \ +- yum -y install \ +- yum-plugin-ovl \ +- gcc \ +- gcc-c++ \ +- ncurses-devel \ +- zlib-devel \ +- openssl-devel \ +- python-devel \ +- libuuid-devel \ +- pkgconfig \ +- # gettext for Xen < 4.13 +- gettext \ +- flex \ +- bison \ +- libaio-devel \ +- glib2-devel \ +- yajl-devel \ +- pixman-devel \ +- glibc-devel \ +- # glibc-devel.i686 for Xen < 4.15 +- glibc-devel.i686 \ +- make \ +- binutils \ +- git \ +- wget \ +- acpica-tools \ +- python-markdown \ +- patch \ +- checkpolicy \ +- dev86 \ +- xz-devel \ +- bzip2 \ +- nasm \ +- && yum clean all +diff --git a/automation/build/centos/CentOS-7.2.repo b/automation/build/centos/CentOS-7.2.repo 
+deleted file mode 100644 +index 4da27faeb5..0000000000 +--- a/automation/build/centos/CentOS-7.2.repo ++++ /dev/null +@@ -1,35 +0,0 @@ +-# CentOS-Base.repo +-# +-# This is a replacement file that pins things to just use CentOS 7.2 +-# from the CentOS Vault. +-# +- +-[base] +-name=CentOS-7.2.1511 - Base +-baseurl=http://vault.centos.org/7.2.1511/os/$basearch/ +-gpgcheck=1 +-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +- +-#released updates +-[updates] +-name=CentOS-7.2.1511 - Updates +-baseurl=http://vault.centos.org/7.2.1511/updates/$basearch/ +-gpgcheck=1 +-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +- +-#additional packages that may be useful +-[extras] +-name=CentOS-7.2.1511 - Extras +-baseurl=http://vault.centos.org/7.2.1511/extras/$basearch/ +-gpgcheck=1 +-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +- +-#additional packages that extend functionality of existing packages +-[centosplus] +-name=CentOS-7.2.1511 - Plus +-baseurl=http://vault.centos.org/7.2.1511/centosplus/$basearch/ +-gpgcheck=1 +-gpgcheck=1 +-enabled=0 +-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +- +diff --git a/automation/gitlab-ci/build.yaml b/automation/gitlab-ci/build.yaml +index f66fbca8a7..bc1a732069 100644 +--- a/automation/gitlab-ci/build.yaml ++++ b/automation/gitlab-ci/build.yaml +@@ -184,16 +184,6 @@ archlinux-gcc-debug: + variables: + CONTAINER: archlinux:current + +-centos-7-2-gcc: +- extends: .gcc-x86-64-build +- variables: +- CONTAINER: centos:7.2 +- +-centos-7-2-gcc-debug: +- extends: .gcc-x86-64-build-debug +- variables: +- CONTAINER: centos:7.2 +- + centos-7-gcc: + extends: .gcc-x86-64-build + variables: +-- +2.40.0 + diff --git a/0059-tools-xenstore-add-memory-accounting-for-nodes.patch b/0059-tools-xenstore-add-memory-accounting-for-nodes.patch deleted file mode 100644 index a1ab308..0000000 --- a/0059-tools-xenstore-add-memory-accounting-for-nodes.patch +++ /dev/null @@ -1,342 +0,0 @@ -From 578d422af0b444a9e437dd0ceddf2049364f1a40 Mon Sep 
17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 59/87] tools/xenstore: add memory accounting for nodes - -Add the memory accounting for Xenstore nodes. In order to make this -not too complicated allow for some sloppiness when writing nodes. Any -hard quota violation will result in no further requests to be accepted. - -This is part of XSA-326 / CVE-2022-42315. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 00e9e32d022be1afc144b75acdaeba8393e63315) ---- - tools/xenstore/xenstored_core.c | 140 ++++++++++++++++++++++--- - tools/xenstore/xenstored_core.h | 12 +++ - tools/xenstore/xenstored_transaction.c | 16 ++- - 3 files changed, 151 insertions(+), 17 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index d86942f5aa77..16504de42017 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -591,6 +591,117 @@ void set_tdb_key(const char *name, TDB_DATA *key) - key->dsize = strlen(name); - } - -+static void get_acc_data(TDB_DATA *key, struct node_account_data *acc) -+{ -+ TDB_DATA old_data; -+ struct xs_tdb_record_hdr *hdr; -+ -+ if (acc->memory < 0) { -+ old_data = tdb_fetch(tdb_ctx, *key); -+ /* No check for error, as the node might not exist. */ -+ if (old_data.dptr == NULL) { -+ acc->memory = 0; -+ } else { -+ hdr = (void *)old_data.dptr; -+ acc->memory = old_data.dsize; -+ acc->domid = hdr->perms[0].id; -+ } -+ talloc_free(old_data.dptr); -+ } -+} -+ -+/* -+ * Per-transaction nodes need to be accounted for the transaction owner. -+ * Those nodes are stored in the data base with the transaction generation -+ * count prepended (e.g. 123/local/domain/...). So testing for the node's -+ * key not to start with "/" is sufficient. 
-+ */ -+static unsigned int get_acc_domid(struct connection *conn, TDB_DATA *key, -+ unsigned int domid) -+{ -+ return (!conn || key->dptr[0] == '/') ? domid : conn->id; -+} -+ -+int do_tdb_write(struct connection *conn, TDB_DATA *key, TDB_DATA *data, -+ struct node_account_data *acc, bool no_quota_check) -+{ -+ struct xs_tdb_record_hdr *hdr = (void *)data->dptr; -+ struct node_account_data old_acc = {}; -+ unsigned int old_domid, new_domid; -+ int ret; -+ -+ if (!acc) -+ old_acc.memory = -1; -+ else -+ old_acc = *acc; -+ -+ get_acc_data(key, &old_acc); -+ old_domid = get_acc_domid(conn, key, old_acc.domid); -+ new_domid = get_acc_domid(conn, key, hdr->perms[0].id); -+ -+ /* -+ * Don't check for ENOENT, as we want to be able to switch orphaned -+ * nodes to new owners. -+ */ -+ if (old_acc.memory) -+ domain_memory_add_nochk(old_domid, -+ -old_acc.memory - key->dsize); -+ ret = domain_memory_add(new_domid, data->dsize + key->dsize, -+ no_quota_check); -+ if (ret) { -+ /* Error path, so no quota check. */ -+ if (old_acc.memory) -+ domain_memory_add_nochk(old_domid, -+ old_acc.memory + key->dsize); -+ return ret; -+ } -+ -+ /* TDB should set errno, but doesn't even set ecode AFAICT. */ -+ if (tdb_store(tdb_ctx, *key, *data, TDB_REPLACE) != 0) { -+ domain_memory_add_nochk(new_domid, -data->dsize - key->dsize); -+ /* Error path, so no quota check. */ -+ if (old_acc.memory) -+ domain_memory_add_nochk(old_domid, -+ old_acc.memory + key->dsize); -+ errno = EIO; -+ return errno; -+ } -+ -+ if (acc) { -+ /* Don't use new_domid, as it might be a transaction node. 
*/ -+ acc->domid = hdr->perms[0].id; -+ acc->memory = data->dsize; -+ } -+ -+ return 0; -+} -+ -+int do_tdb_delete(struct connection *conn, TDB_DATA *key, -+ struct node_account_data *acc) -+{ -+ struct node_account_data tmp_acc; -+ unsigned int domid; -+ -+ if (!acc) { -+ acc = &tmp_acc; -+ acc->memory = -1; -+ } -+ -+ get_acc_data(key, acc); -+ -+ if (tdb_delete(tdb_ctx, *key)) { -+ errno = EIO; -+ return errno; -+ } -+ -+ if (acc->memory) { -+ domid = get_acc_domid(conn, key, acc->domid); -+ domain_memory_add_nochk(domid, -acc->memory - key->dsize); -+ } -+ -+ return 0; -+} -+ - /* - * If it fails, returns NULL and sets errno. - * Temporary memory allocations will be done with ctx. -@@ -644,9 +755,15 @@ struct node *read_node(struct connection *conn, const void *ctx, - - /* Permissions are struct xs_permissions. */ - node->perms.p = hdr->perms; -+ node->acc.domid = node->perms.p[0].id; -+ node->acc.memory = data.dsize; - if (domain_adjust_node_perms(conn, node)) - goto error; - -+ /* If owner is gone reset currently accounted memory size. */ -+ if (node->acc.domid != node->perms.p[0].id) -+ node->acc.memory = 0; -+ - /* Data is binary blob (usually ascii, no nul). */ - node->data = node->perms.p + hdr->num_perms; - /* Children is strings, nul separated. */ -@@ -715,12 +832,9 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - p += node->datalen; - memcpy(p, node->children, node->childlen); - -- /* TDB should set errno, but doesn't even set ecode AFAICT. 
*/ -- if (tdb_store(tdb_ctx, *key, data, TDB_REPLACE) != 0) { -- corrupt(conn, "Write of %s failed", key->dptr); -- errno = EIO; -- return errno; -- } -+ if (do_tdb_write(conn, key, &data, &node->acc, no_quota_check)) -+ return EIO; -+ - return 0; - } - -@@ -1222,7 +1336,7 @@ static void delete_node_single(struct connection *conn, struct node *node) - if (access_node(conn, node, NODE_ACCESS_DELETE, &key)) - return; - -- if (tdb_delete(tdb_ctx, key) != 0) { -+ if (do_tdb_delete(conn, &key, &node->acc) != 0) { - corrupt(conn, "Could not delete '%s'", node->name); - return; - } -@@ -1295,6 +1409,7 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - /* No children, no data */ - node->children = node->data = NULL; - node->childlen = node->datalen = 0; -+ node->acc.memory = 0; - node->parent = parent; - return node; - -@@ -1303,17 +1418,17 @@ nomem: - return NULL; - } - --static void destroy_node_rm(struct node *node) -+static void destroy_node_rm(struct connection *conn, struct node *node) - { - if (streq(node->name, "/")) - corrupt(NULL, "Destroying root node!"); - -- tdb_delete(tdb_ctx, node->key); -+ do_tdb_delete(conn, &node->key, &node->acc); - } - - static int destroy_node(struct connection *conn, struct node *node) - { -- destroy_node_rm(node); -+ destroy_node_rm(conn, node); - domain_entry_dec(conn, node); - - /* -@@ -1365,7 +1480,7 @@ static struct node *create_node(struct connection *conn, const void *ctx, - /* Account for new node */ - if (i->parent) { - if (domain_entry_inc(conn, i)) { -- destroy_node_rm(i); -+ destroy_node_rm(conn, i); - return NULL; - } - } -@@ -2291,7 +2406,7 @@ static int clean_store_(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA val, - if (!hashtable_search(reachable, name)) { - log("clean_store: '%s' is orphaned!", name); - if (recovery) { -- tdb_delete(tdb, key); -+ do_tdb_delete(NULL, &key, NULL); - } - } - -@@ -3149,6 +3264,7 @@ void read_state_node(const void *ctx, const void *state) - if (!node) - 
barf("allocation error restoring node"); - -+ node->acc.memory = 0; - node->name = name; - node->generation = ++generation; - node->datalen = sn->data_len; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 373af18297bf..da9ecce67f31 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -176,6 +176,11 @@ struct node_perms { - struct xs_permissions *p; - }; - -+struct node_account_data { -+ unsigned int domid; -+ int memory; /* -1 if unknown */ -+}; -+ - struct node { - const char *name; - /* Key used to update TDB */ -@@ -198,6 +203,9 @@ struct node { - /* Children, each nul-terminated. */ - unsigned int childlen; - char *children; -+ -+ /* Allocation information for node currently in store. */ -+ struct node_account_data acc; - }; - - /* Return the only argument in the input. */ -@@ -306,6 +314,10 @@ extern xengnttab_handle **xgt_handle; - int remember_string(struct hashtable *hash, const char *str); - - void set_tdb_key(const char *name, TDB_DATA *key); -+int do_tdb_write(struct connection *conn, TDB_DATA *key, TDB_DATA *data, -+ struct node_account_data *acc, bool no_quota_check); -+int do_tdb_delete(struct connection *conn, TDB_DATA *key, -+ struct node_account_data *acc); - - void conn_free_buffered_data(struct connection *conn); - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 7bd41eb475e3..ace9a11d77bb 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -153,6 +153,9 @@ struct transaction - /* List of all transactions active on this connection. */ - struct list_head list; - -+ /* Connection this transaction is associated with. */ -+ struct connection *conn; -+ - /* Connection-local identifier for this transaction. 
*/ - uint32_t id; - -@@ -286,6 +289,8 @@ int access_node(struct connection *conn, struct node *node, - - introduce = true; - i->ta_node = false; -+ /* acc.memory < 0 means "unknown, get size from TDB". */ -+ node->acc.memory = -1; - - /* - * Additional transaction-specific node for read type. We only -@@ -410,11 +415,11 @@ static int finalize_transaction(struct connection *conn, - goto err; - hdr = (void *)data.dptr; - hdr->generation = ++generation; -- ret = tdb_store(tdb_ctx, key, data, -- TDB_REPLACE); -+ ret = do_tdb_write(conn, &key, &data, NULL, -+ true); - talloc_free(data.dptr); - } else { -- ret = tdb_delete(tdb_ctx, key); -+ ret = do_tdb_delete(conn, &key, NULL); - } - if (ret) - goto err; -@@ -425,7 +430,7 @@ static int finalize_transaction(struct connection *conn, - } - } - -- if (i->ta_node && tdb_delete(tdb_ctx, ta_key)) -+ if (i->ta_node && do_tdb_delete(conn, &ta_key, NULL)) - goto err; - list_del(&i->list); - talloc_free(i); -@@ -453,7 +458,7 @@ static int destroy_transaction(void *_transaction) - i->node); - if (trans_name) { - set_tdb_key(trans_name, &key); -- tdb_delete(tdb_ctx, key); -+ do_tdb_delete(trans->conn, &key, NULL); - } - } - list_del(&i->list); -@@ -497,6 +502,7 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) - - INIT_LIST_HEAD(&trans->accessed); - INIT_LIST_HEAD(&trans->changed_domains); -+ trans->conn = conn; - trans->fail = false; - trans->generation = ++generation; - --- -2.37.4 - diff --git a/0060-automation-Remove-non-debug-x86_32-build-jobs.patch b/0060-automation-Remove-non-debug-x86_32-build-jobs.patch new file mode 100644 index 0000000..c5516be --- /dev/null +++ b/0060-automation-Remove-non-debug-x86_32-build-jobs.patch @@ -0,0 +1,67 @@ +From 27974fde92850419e385ad0355997c54d78046f2 Mon Sep 17 00:00:00 2001 +From: Anthony PERARD <anthony.perard@citrix.com> +Date: Fri, 24 Feb 2023 17:29:15 +0000 +Subject: [PATCH 60/61] automation: Remove non-debug x86_32 build jobs + +In the interest of having 
less jobs, we remove the x86_32 build jobs +that do release build. Debug build is very likely to be enough to find +32bit build issues. + +Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +(cherry picked from commit 7b66792ea7f77fb9e587e1e9c530a7c869eecba1) +--- + automation/gitlab-ci/build.yaml | 20 -------------------- + 1 file changed, 20 deletions(-) + +diff --git a/automation/gitlab-ci/build.yaml b/automation/gitlab-ci/build.yaml +index bc1a732069..4b51ad9e34 100644 +--- a/automation/gitlab-ci/build.yaml ++++ b/automation/gitlab-ci/build.yaml +@@ -264,21 +264,11 @@ debian-stretch-gcc-debug: + variables: + CONTAINER: debian:stretch + +-debian-stretch-32-clang: +- extends: .clang-x86-32-build +- variables: +- CONTAINER: debian:stretch-i386 +- + debian-stretch-32-clang-debug: + extends: .clang-x86-32-build-debug + variables: + CONTAINER: debian:stretch-i386 + +-debian-stretch-32-gcc: +- extends: .gcc-x86-32-build +- variables: +- CONTAINER: debian:stretch-i386 +- + debian-stretch-32-gcc-debug: + extends: .gcc-x86-32-build-debug + variables: +@@ -316,21 +306,11 @@ debian-unstable-gcc-debug-randconfig: + CONTAINER: debian:unstable + RANDCONFIG: y + +-debian-unstable-32-clang: +- extends: .clang-x86-32-build +- variables: +- CONTAINER: debian:unstable-i386 +- + debian-unstable-32-clang-debug: + extends: .clang-x86-32-build-debug + variables: + CONTAINER: debian:unstable-i386 + +-debian-unstable-32-gcc: +- extends: .gcc-x86-32-build +- variables: +- CONTAINER: debian:unstable-i386 +- + debian-unstable-32-gcc-debug: + extends: .gcc-x86-32-build-debug + variables: +-- +2.40.0 + diff --git a/0060-tools-xenstore-add-exports-for-quota-variables.patch b/0060-tools-xenstore-add-exports-for-quota-variables.patch deleted file mode 100644 index 79ca465..0000000 --- a/0060-tools-xenstore-add-exports-for-quota-variables.patch +++ /dev/null @@ -1,62 +0,0 @@ -From 0a67b4eef104c36bef52990e413ef361acc8183c Mon Sep 17 
00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 60/87] tools/xenstore: add exports for quota variables - -Some quota variables are not exported via header files. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 1da16d5990b5f7752657fca3e948f735177ea9ad) ---- - tools/xenstore/xenstored_core.h | 5 +++++ - tools/xenstore/xenstored_transaction.c | 1 - - tools/xenstore/xenstored_watch.c | 2 -- - 3 files changed, 5 insertions(+), 3 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index da9ecce67f31..bfd3fc1e9df3 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -275,6 +275,11 @@ extern TDB_CONTEXT *tdb_ctx; - extern int dom0_domid; - extern int dom0_event; - extern int priv_domid; -+extern int quota_nb_watch_per_domain; -+extern int quota_max_transaction; -+extern int quota_max_entry_size; -+extern int quota_nb_perms_per_node; -+extern int quota_max_path_len; - extern int quota_nb_entry_per_domain; - extern int quota_req_outstanding; - extern int quota_trans_nodes; -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index ace9a11d77bb..28774813de83 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -175,7 +175,6 @@ struct transaction - bool fail; - }; - --extern int quota_max_transaction; - uint64_t generation; - - static struct accessed_node *find_accessed_node(struct transaction *trans, -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index fdf9b2d653a0..85362bcce314 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -31,8 +31,6 @@ - #include "xenstored_domain.h" - #include "xenstored_transaction.h" - --extern int quota_nb_watch_per_domain; -- - struct watch - 
{ - /* Watches on this connection */ --- -2.37.4 - diff --git a/0061-CI-Remove-llvm-8-from-the-Debian-Stretch-container.patch b/0061-CI-Remove-llvm-8-from-the-Debian-Stretch-container.patch new file mode 100644 index 0000000..9170382 --- /dev/null +++ b/0061-CI-Remove-llvm-8-from-the-Debian-Stretch-container.patch @@ -0,0 +1,103 @@ +From 31627a059c2e186f4ad12d171d964b09abe8a4a9 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Fri, 24 Mar 2023 17:59:56 +0000 +Subject: [PATCH 61/61] CI: Remove llvm-8 from the Debian Stretch container + +For similar reasons to c/s a6b1e2b80fe20. While this container is still +build-able for now, all the other problems with explicitly-versioned compilers +remain. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +(cherry picked from commit 7a298375721636290a57f31bb0f7c2a5a38956a4) +--- + automation/build/debian/stretch-llvm-8.list | 3 --- + automation/build/debian/stretch.dockerfile | 12 --------- + automation/gitlab-ci/build.yaml | 27 --------------------- + 3 files changed, 42 deletions(-) + delete mode 100644 automation/build/debian/stretch-llvm-8.list + +diff --git a/automation/build/debian/stretch-llvm-8.list b/automation/build/debian/stretch-llvm-8.list +deleted file mode 100644 +index 09fe843fb2..0000000000 +--- a/automation/build/debian/stretch-llvm-8.list ++++ /dev/null +@@ -1,3 +0,0 @@ +-# Strech LLVM 8 repos +-deb http://apt.llvm.org/stretch/ llvm-toolchain-stretch-8 main +-deb-src http://apt.llvm.org/stretch/ llvm-toolchain-stretch-8 main +diff --git a/automation/build/debian/stretch.dockerfile b/automation/build/debian/stretch.dockerfile +index da6aa874dd..9861acbcc3 100644 +--- a/automation/build/debian/stretch.dockerfile ++++ b/automation/build/debian/stretch.dockerfile +@@ -53,15 +53,3 @@ RUN apt-get update && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* /tmp/* /var/tmp/* +- +-RUN wget 
-O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - +-COPY stretch-llvm-8.list /etc/apt/sources.list.d/ +- +-RUN apt-get update && \ +- apt-get --quiet --yes install \ +- clang-8 \ +- lld-8 \ +- && \ +- apt-get autoremove -y && \ +- apt-get clean && \ +- rm -rf /var/lib/apt/lists* /tmp/* /var/tmp/* +diff --git a/automation/gitlab-ci/build.yaml b/automation/gitlab-ci/build.yaml +index 4b51ad9e34..fd8034b429 100644 +--- a/automation/gitlab-ci/build.yaml ++++ b/automation/gitlab-ci/build.yaml +@@ -27,13 +27,6 @@ + CXX: clang++ + clang: y + +-.clang-8-tmpl: +- variables: &clang-8 +- CC: clang-8 +- CXX: clang++-8 +- LD: ld.lld-8 +- clang: y +- + .x86-64-build-tmpl: + <<: *build + variables: +@@ -98,16 +91,6 @@ + variables: + <<: *clang + +-.clang-8-x86-64-build: +- extends: .x86-64-build +- variables: +- <<: *clang-8 +- +-.clang-8-x86-64-build-debug: +- extends: .x86-64-build-debug +- variables: +- <<: *clang-8 +- + .clang-x86-32-build: + extends: .x86-32-build + variables: +@@ -244,16 +227,6 @@ debian-stretch-clang-debug: + variables: + CONTAINER: debian:stretch + +-debian-stretch-clang-8: +- extends: .clang-8-x86-64-build +- variables: +- CONTAINER: debian:stretch +- +-debian-stretch-clang-8-debug: +- extends: .clang-8-x86-64-build-debug +- variables: +- CONTAINER: debian:stretch +- + debian-stretch-gcc: + extends: .gcc-x86-64-build + variables: +-- +2.40.0 + diff --git a/0061-tools-xenstore-add-control-command-for-setting-and-s.patch b/0061-tools-xenstore-add-control-command-for-setting-and-s.patch deleted file mode 100644 index 5adcd35..0000000 --- a/0061-tools-xenstore-add-control-command-for-setting-and-s.patch +++ /dev/null @@ -1,248 +0,0 @@ -From b584b9b95687655f4f9f5c37fea3b1eea3f32886 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 61/87] tools/xenstore: add control command for setting and - showing quota - -Add a xenstore-control command "quota" to: -- show current quota 
settings -- change quota settings -- show current quota related values of a domain - -Note that in the case the new quota is lower than existing one, -Xenstored may continue to handle requests from a domain exceeding the -new limit (depends on which one has been broken) and the amount of -resource used will not change. However the domain will not be able to -create more resource (associated to the quota) until it is back to below -the limit. - -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 9c484bef83496b683b0087e3bd2a560da4aa37af) ---- - docs/misc/xenstore.txt | 11 +++ - tools/xenstore/xenstored_control.c | 111 +++++++++++++++++++++++++++++ - tools/xenstore/xenstored_domain.c | 33 +++++++++ - tools/xenstore/xenstored_domain.h | 2 + - 4 files changed, 157 insertions(+) - -diff --git a/docs/misc/xenstore.txt b/docs/misc/xenstore.txt -index 334dc8b6fdf5..a7d006519ae8 100644 ---- a/docs/misc/xenstore.txt -+++ b/docs/misc/xenstore.txt -@@ -366,6 +366,17 @@ CONTROL <command>|[<parameters>|] - print|<string> - print <string> to syslog (xenstore runs as daemon) or - to console (xenstore runs as stubdom) -+ quota|[set <name> <val>|<domid>] -+ without parameters: print the current quota settings -+ with "set <name> <val>": set the quota <name> to new value -+ <val> (The admin should make sure all the domain usage is -+ below the quota. If it is not, then Xenstored may continue to -+ handle requests from the domain as long as the resource -+ violating the new quota setting isn't increased further) -+ with "<domid>": print quota related accounting data for -+ the domain <domid> -+ quota-soft|[set <name> <val>] -+ like the "quota" command, but for soft-quota. 
- help <supported-commands> - return list of supported commands for CONTROL - -diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c -index adb8d51b043b..1031a81c3874 100644 ---- a/tools/xenstore/xenstored_control.c -+++ b/tools/xenstore/xenstored_control.c -@@ -196,6 +196,115 @@ static int do_control_log(void *ctx, struct connection *conn, - return 0; - } - -+struct quota { -+ const char *name; -+ int *quota; -+ const char *descr; -+}; -+ -+static const struct quota hard_quotas[] = { -+ { "nodes", &quota_nb_entry_per_domain, "Nodes per domain" }, -+ { "watches", &quota_nb_watch_per_domain, "Watches per domain" }, -+ { "transactions", &quota_max_transaction, "Transactions per domain" }, -+ { "outstanding", &quota_req_outstanding, -+ "Outstanding requests per domain" }, -+ { "transaction-nodes", &quota_trans_nodes, -+ "Max. number of accessed nodes per transaction" }, -+ { "memory", &quota_memory_per_domain_hard, -+ "Total Xenstore memory per domain (error level)" }, -+ { "node-size", &quota_max_entry_size, "Max. size of a node" }, -+ { "path-max", &quota_max_path_len, "Max. length of a node path" }, -+ { "permissions", &quota_nb_perms_per_node, -+ "Max.
number of permissions per node" }, -+ { NULL, NULL, NULL } -+}; -+ -+static const struct quota soft_quotas[] = { -+ { "memory", &quota_memory_per_domain_soft, -+ "Total Xenstore memory per domain (warning level)" }, -+ { NULL, NULL, NULL } -+}; -+ -+static int quota_show_current(const void *ctx, struct connection *conn, -+ const struct quota *quotas) -+{ -+ char *resp; -+ unsigned int i; -+ -+ resp = talloc_strdup(ctx, "Quota settings:\n"); -+ if (!resp) -+ return ENOMEM; -+ -+ for (i = 0; quotas[i].quota; i++) { -+ resp = talloc_asprintf_append(resp, "%-17s: %8d %s\n", -+ quotas[i].name, *quotas[i].quota, -+ quotas[i].descr); -+ if (!resp) -+ return ENOMEM; -+ } -+ -+ send_reply(conn, XS_CONTROL, resp, strlen(resp) + 1); -+ -+ return 0; -+} -+ -+static int quota_set(const void *ctx, struct connection *conn, -+ char **vec, int num, const struct quota *quotas) -+{ -+ unsigned int i; -+ int val; -+ -+ if (num != 2) -+ return EINVAL; -+ -+ val = atoi(vec[1]); -+ if (val < 1) -+ return EINVAL; -+ -+ for (i = 0; quotas[i].quota; i++) { -+ if (!strcmp(vec[0], quotas[i].name)) { -+ *quotas[i].quota = val; -+ send_ack(conn, XS_CONTROL); -+ return 0; -+ } -+ } -+ -+ return EINVAL; -+} -+ -+static int quota_get(const void *ctx, struct connection *conn, -+ char **vec, int num) -+{ -+ if (num != 1) -+ return EINVAL; -+ -+ return domain_get_quota(ctx, conn, atoi(vec[0])); -+} -+ -+static int do_control_quota(void *ctx, struct connection *conn, -+ char **vec, int num) -+{ -+ if (num == 0) -+ return quota_show_current(ctx, conn, hard_quotas); -+ -+ if (!strcmp(vec[0], "set")) -+ return quota_set(ctx, conn, vec + 1, num - 1, hard_quotas); -+ -+ return quota_get(ctx, conn, vec, num); -+} -+ -+static int do_control_quota_s(void *ctx, struct connection *conn, -+ char **vec, int num) -+{ -+ if (num == 0) -+ return quota_show_current(ctx, conn, soft_quotas); -+ -+ if (!strcmp(vec[0], "set")) -+ return quota_set(ctx, conn, vec + 1, num - 1, soft_quotas); -+ -+ return EINVAL; -+} -+ - #ifdef
__MINIOS__ - static int do_control_memreport(void *ctx, struct connection *conn, - char **vec, int num) -@@ -847,6 +956,8 @@ static struct cmd_s cmds[] = { - { "memreport", do_control_memreport, "[<file>]" }, - #endif - { "print", do_control_print, "<string>" }, -+ { "quota", do_control_quota, "[set <name> <val>|<domid>]" }, -+ { "quota-soft", do_control_quota_s, "[set <name> <val>]" }, - { "help", do_control_help, "" }, - }; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 94fd561e9de4..e7c6886ccf47 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -31,6 +31,7 @@ - #include "xenstored_domain.h" - #include "xenstored_transaction.h" - #include "xenstored_watch.h" -+#include "xenstored_control.h" - - #include <xenevtchn.h> - #include <xenctrl.h> -@@ -345,6 +346,38 @@ static struct domain *find_domain_struct(unsigned int domid) - return NULL; - } - -+int domain_get_quota(const void *ctx, struct connection *conn, -+ unsigned int domid) -+{ -+ struct domain *d = find_domain_struct(domid); -+ char *resp; -+ int ta; -+ -+ if (!d) -+ return ENOENT; -+ -+ ta = d->conn ? 
d->conn->transaction_started : 0; -+ resp = talloc_asprintf(ctx, "Domain %u:\n", domid); -+ if (!resp) -+ return ENOMEM; -+ -+#define ent(t, e) \ -+ resp = talloc_asprintf_append(resp, "%-16s: %8d\n", #t, e); \ -+ if (!resp) return ENOMEM -+ -+ ent(nodes, d->nbentry); -+ ent(watches, d->nbwatch); -+ ent(transactions, ta); -+ ent(outstanding, d->nboutstanding); -+ ent(memory, d->memory); -+ -+#undef ent -+ -+ send_reply(conn, XS_CONTROL, resp, strlen(resp) + 1); -+ -+ return 0; -+} -+ - static struct domain *alloc_domain(const void *context, unsigned int domid) - { - struct domain *domain; -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 633c9a0a0a1f..904faa923afb 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -87,6 +87,8 @@ int domain_watch(struct connection *conn); - void domain_outstanding_inc(struct connection *conn); - void domain_outstanding_dec(struct connection *conn); - void domain_outstanding_domid_dec(unsigned int domid); -+int domain_get_quota(const void *ctx, struct connection *conn, -+ unsigned int domid); - - /* Special node permission handling. 
*/ - int set_perms_special(struct connection *conn, const char *name, --- -2.37.4 - diff --git a/0062-tools-ocaml-xenstored-Synchronise-defaults-with-oxen.patch b/0062-tools-ocaml-xenstored-Synchronise-defaults-with-oxen.patch deleted file mode 100644 index b9f5b18..0000000 --- a/0062-tools-ocaml-xenstored-Synchronise-defaults-with-oxen.patch +++ /dev/null @@ -1,63 +0,0 @@ -From b0e95b451225de4db99bbe0b8dc79fdf08873e9e Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:01 +0100 -Subject: [PATCH 62/87] tools/ocaml/xenstored: Synchronise defaults with - oxenstore.conf.in -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -We currently have 2 different set of defaults in upstream Xen git tree: -* defined in the source code, only used if there is no config file -* defined in the oxenstored.conf.in upstream Xen - -An oxenstored.conf file is not mandatory, and if missing, maxrequests in -particular has an unsafe default. - -Resync the defaults from oxenstored.conf.in into the source code. - -This is part of XSA-326 / CVE-2022-42316. 
- -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 84734955d4bf629ba459a74773afcde50a52236f) ---- - tools/ocaml/xenstored/define.ml | 6 +++--- - tools/ocaml/xenstored/quota.ml | 4 ++-- - 2 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml -index ebe18b8e312c..6b06f808595b 100644 ---- a/tools/ocaml/xenstored/define.ml -+++ b/tools/ocaml/xenstored/define.ml -@@ -21,9 +21,9 @@ let xs_daemon_socket = Paths.xen_run_stored ^ "/socket" - - let default_config_dir = Paths.xen_config_dir - --let maxwatch = ref (50) --let maxtransaction = ref (20) --let maxrequests = ref (-1) (* maximum requests per transaction *) -+let maxwatch = ref (100) -+let maxtransaction = ref (10) -+let maxrequests = ref (1024) (* maximum requests per transaction *) - - let conflict_burst_limit = ref 5.0 - let conflict_max_history_seconds = ref 0.05 -diff --git a/tools/ocaml/xenstored/quota.ml b/tools/ocaml/xenstored/quota.ml -index abcac912805a..6e3d6401ae89 100644 ---- a/tools/ocaml/xenstored/quota.ml -+++ b/tools/ocaml/xenstored/quota.ml -@@ -20,8 +20,8 @@ exception Transaction_opened - - let warn fmt = Logging.warn "quota" fmt - let activate = ref true --let maxent = ref (10000) --let maxsize = ref (4096) -+let maxent = ref (1000) -+let maxsize = ref (2048) - - type t = { - maxent: int; (* max entities per domU *) --- -2.37.4 - diff --git a/0063-tools-ocaml-xenstored-Check-for-maxrequests-before-p.patch b/0063-tools-ocaml-xenstored-Check-for-maxrequests-before-p.patch deleted file mode 100644 index 5b3b646..0000000 --- a/0063-tools-ocaml-xenstored-Check-for-maxrequests-before-p.patch +++ /dev/null @@ -1,101 +0,0 @@ -From ab21bb1971a7fa9308053b0686f43277f6e8a6c9 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Thu, 28 Jul 2022 17:08:15 +0100 -Subject: [PATCH 63/87] 
tools/ocaml/xenstored: Check for maxrequests before - performing operations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Previously we'd perform the operation, record the updated tree in the -transaction record, then try to insert a watchop path and the reply packet. - -If we exceeded max requests we would've returned EQUOTA, but still: -* have performed the operation on the transaction's tree -* have recorded the watchop, making this queue effectively unbounded - -It is better if we check whether we'd have room to store the operation before -performing the transaction, and raise EQUOTA there. Then the transaction -record won't grow. - -This is part of XSA-326 / CVE-2022-42317. - -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 329f4d1a6535c6c5a34025ca0d03fc5c7228fcff) ---- - tools/ocaml/xenstored/process.ml | 4 +++- - tools/ocaml/xenstored/transaction.ml | 16 ++++++++++++---- - 2 files changed, 15 insertions(+), 5 deletions(-) - -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index 27790d4a5c41..dd58e6979cf9 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -389,6 +389,7 @@ let input_handle_error ~cons ~doms ~fct ~con ~t ~req = - let reply_error e = - Packet.Error e in - try -+ Transaction.check_quota_exn ~perm:(Connection.get_perm con) t; - fct con t doms cons req.Packet.data - with - | Define.Invalid_path -> reply_error "EINVAL" -@@ -681,9 +682,10 @@ let process_packet ~store ~cons ~doms ~con ~req = - in - - let response = try -+ Transaction.check_quota_exn ~perm:(Connection.get_perm con) t; - if tid <> Transaction.none then - (* Remember the request and response for this operation in case we need to replay the transaction *) -- Transaction.add_operation ~perm:(Connection.get_perm con) t req response; -+ Transaction.add_operation t req response; - 
response - with Quota.Limit_reached -> - Packet.Error "EQUOTA" -diff --git a/tools/ocaml/xenstored/transaction.ml b/tools/ocaml/xenstored/transaction.ml -index 17b1bdf2eaf9..294143e2335b 100644 ---- a/tools/ocaml/xenstored/transaction.ml -+++ b/tools/ocaml/xenstored/transaction.ml -@@ -85,6 +85,7 @@ type t = { - oldroot: Store.Node.t; - mutable paths: (Xenbus.Xb.Op.operation * Store.Path.t) list; - mutable operations: (Packet.request * Packet.response) list; -+ mutable quota_reached: bool; - mutable read_lowpath: Store.Path.t option; - mutable write_lowpath: Store.Path.t option; - } -@@ -127,6 +128,7 @@ let make ?(internal=false) id store = - oldroot = Store.get_root store; - paths = []; - operations = []; -+ quota_reached = false; - read_lowpath = None; - write_lowpath = None; - } in -@@ -143,13 +145,19 @@ let get_root t = Store.get_root t.store - - let is_read_only t = t.paths = [] - let add_wop t ty path = t.paths <- (ty, path) :: t.paths --let add_operation ~perm t request response = -+let get_operations t = List.rev t.operations -+ -+let check_quota_exn ~perm t = - if !Define.maxrequests >= 0 - && not (Perms.Connection.is_dom0 perm) -- && List.length t.operations >= !Define.maxrequests -- then raise Quota.Limit_reached; -+ && (t.quota_reached || List.length t.operations >= !Define.maxrequests) -+ then begin -+ t.quota_reached <- true; -+ raise Quota.Limit_reached; -+ end -+ -+let add_operation t request response = - t.operations <- (request, response) :: t.operations --let get_operations t = List.rev t.operations - let set_read_lowpath t path = t.read_lowpath <- get_lowest path t.read_lowpath - let set_write_lowpath t path = t.write_lowpath <- get_lowest path t.write_lowpath - --- -2.37.4 - diff --git a/0064-tools-ocaml-GC-parameter-tuning.patch b/0064-tools-ocaml-GC-parameter-tuning.patch deleted file mode 100644 index 6c80e2d..0000000 --- a/0064-tools-ocaml-GC-parameter-tuning.patch +++ /dev/null @@ -1,126 +0,0 @@ -From 
a63bbcf5318b487ca86574d7fcf916958af5ed02 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:07 +0100 -Subject: [PATCH 64/87] tools/ocaml: GC parameter tuning -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -By default the OCaml garbage collector would return memory to the OS only -after unused memory is 5x live memory. Tweak this to 120% instead, which -would match the major GC speed. - -This is part of XSA-326. - -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 4a8bacff20b857ca0d628ef5525877ade11f2a42) ---- - tools/ocaml/xenstored/define.ml | 1 + - tools/ocaml/xenstored/xenstored.ml | 64 ++++++++++++++++++++++++++++++ - 2 files changed, 65 insertions(+) - -diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml -index 6b06f808595b..ba63a8147e09 100644 ---- a/tools/ocaml/xenstored/define.ml -+++ b/tools/ocaml/xenstored/define.ml -@@ -25,6 +25,7 @@ let maxwatch = ref (100) - let maxtransaction = ref (10) - let maxrequests = ref (1024) (* maximum requests per transaction *) - -+let gc_max_overhead = ref 120 (* 120% see comment in xenstored.ml *) - let conflict_burst_limit = ref 5.0 - let conflict_max_history_seconds = ref 0.05 - let conflict_rate_limit_is_aggregate = ref true -diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml -index d44ae673c42a..3b57ad016dfb 100644 ---- a/tools/ocaml/xenstored/xenstored.ml -+++ b/tools/ocaml/xenstored/xenstored.ml -@@ -104,6 +104,7 @@ let parse_config filename = - ("quota-maxsize", Config.Set_int Quota.maxsize); - ("quota-maxrequests", Config.Set_int Define.maxrequests); - ("quota-path-max", Config.Set_int Define.path_max); -+ ("gc-max-overhead", Config.Set_int Define.gc_max_overhead); - ("test-eagain", Config.Set_bool Transaction.test_eagain); - ("persistent", 
Config.Set_bool Disk.enable); - ("xenstored-log-file", Config.String Logging.set_xenstored_log_destination); -@@ -265,6 +266,67 @@ let to_file store cons fds file = - (fun () -> close_out channel) - end - -+(* -+ By default OCaml's GC only returns memory to the OS when it exceeds a -+ configurable 'max overhead' setting. -+ The default is 500%, that is 5/6th of the OCaml heap needs to be free -+ and only 1/6th live for a compaction to be triggerred that would -+ release memory back to the OS. -+ If the limit is not hit then the OCaml process can reuse that memory -+ for its own purposes, but other processes won't be able to use it. -+ -+ There is also a 'space overhead' setting that controls how much work -+ each major GC slice does, and by default aims at having no more than -+ 80% or 120% (depending on version) garbage values compared to live -+ values. -+ This doesn't have as much relevance to memory returned to the OS as -+ long as space_overhead <= max_overhead, because compaction is only -+ triggerred at the end of major GC cycles. -+ -+ The defaults are too large once the program starts using ~100MiB of -+ memory, at which point ~500MiB would be unavailable to other processes -+ (which would be fine if this was the main process in this VM, but it is -+ not). -+ -+ Max overhead can also be set to 0, however this is for testing purposes -+ only (setting it lower than 'space overhead' wouldn't help because the -+ major GC wouldn't run fast enough, and compaction does have a -+ performance cost: we can only compact contiguous regions, so memory has -+ to be moved around). -+ -+ Max overhead controls how often the heap is compacted, which is useful -+ if there are burst of activity followed by long periods of idle state, -+ or if a domain quits, etc. Compaction returns memory to the OS. 
-+ -+ wasted = live * space_overhead / 100 -+ -+ For globally overriding the GC settings one can use OCAMLRUNPARAM, -+ however we provide a config file override to be consistent with other -+ oxenstored settings. -+ -+ One might want to dynamically adjust the overhead setting based on used -+ memory, i.e. to use a fixed upper bound in bytes, not percentage. However -+ measurements show that such adjustments increase GC overhead massively, -+ while still not guaranteeing that memory is returned any more quickly -+ than with a percentage based setting. -+ -+ The allocation policy could also be tweaked, e.g. first fit would reduce -+ fragmentation and thus memory usage, but the documentation warns that it -+ can be sensibly slower, and indeed one of our own testcases can trigger -+ such a corner case where it is multiple times slower, so it is best to keep -+ the default allocation policy (next-fit/best-fit depending on version). -+ -+ There are other tweaks that can be attempted in the future, e.g. setting -+ 'ulimit -v' to 75% of RAM, however getting the kernel to actually return -+ NULL from allocations is difficult even with that setting, and without a -+ NULL the emergency GC won't be triggerred. -+ Perhaps cgroup limits could help, but for now tweak the safest only. 
-+*) -+ -+let tweak_gc () = -+ Gc.set { (Gc.get ()) with Gc.max_overhead = !Define.gc_max_overhead } -+ -+ - let _ = - let cf = do_argv in - let pidfile = -@@ -274,6 +336,8 @@ let _ = - default_pidfile - in - -+ tweak_gc (); -+ - (try - Unixext.mkdir_rec (Filename.dirname pidfile) 0o755 - with _ -> --- -2.37.4 - diff --git a/0065-tools-ocaml-libs-xb-hide-type-of-Xb.t.patch b/0065-tools-ocaml-libs-xb-hide-type-of-Xb.t.patch deleted file mode 100644 index 4c1bcbe..0000000 --- a/0065-tools-ocaml-libs-xb-hide-type-of-Xb.t.patch +++ /dev/null @@ -1,92 +0,0 @@ -From 8b60ad49b46f2e020e0f0847df80c768d669cdb2 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Fri, 29 Jul 2022 18:53:29 +0100 -Subject: [PATCH 65/87] tools/ocaml/libs/xb: hide type of Xb.t -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Hiding the type will make it easier to change the implementation -in the future without breaking code that relies on it. - -No functional change. 
- -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 7ade30a1451734d041363c750a65d322e25b47ba) ---- - tools/ocaml/libs/xb/xb.ml | 3 +++ - tools/ocaml/libs/xb/xb.mli | 9 ++------- - tools/ocaml/xenstored/connection.ml | 8 ++------ - 3 files changed, 7 insertions(+), 13 deletions(-) - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 104d319d7747..8404ddd8a682 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -196,6 +196,9 @@ let peek_output con = Queue.peek con.pkt_out - let input_len con = Queue.length con.pkt_in - let has_in_packet con = Queue.length con.pkt_in > 0 - let get_in_packet con = Queue.pop con.pkt_in -+let has_partial_input con = match con.partial_in with -+ | HaveHdr _ -> true -+ | NoHdr (n, _) -> n < Partial.header_size () - let has_more_input con = - match con.backend with - | Fd _ -> false -diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli -index 3a00da6cddc1..794e35bb343e 100644 ---- a/tools/ocaml/libs/xb/xb.mli -+++ b/tools/ocaml/libs/xb/xb.mli -@@ -66,13 +66,7 @@ type backend_mmap = { - type backend_fd = { fd : Unix.file_descr; } - type backend = Fd of backend_fd | Xenmmap of backend_mmap - type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes --type t = { -- backend : backend; -- pkt_in : Packet.t Queue.t; -- pkt_out : Packet.t Queue.t; -- mutable partial_in : partial_buf; -- mutable partial_out : string; --} -+type t - val init_partial_in : unit -> partial_buf - val reconnect : t -> unit - val queue : t -> Packet.t -> unit -@@ -97,6 +91,7 @@ val has_output : t -> bool - val peek_output : t -> Packet.t - val input_len : t -> int - val has_in_packet : t -> bool -+val has_partial_input : t -> bool - val get_in_packet : t -> Packet.t - val has_more_input : t -> bool - val is_selectable : t -> bool -diff --git a/tools/ocaml/xenstored/connection.ml 
b/tools/ocaml/xenstored/connection.ml -index 65f99ea6f28a..38b47363a173 100644 ---- a/tools/ocaml/xenstored/connection.ml -+++ b/tools/ocaml/xenstored/connection.ml -@@ -125,9 +125,7 @@ let get_perm con = - let set_target con target_domid = - con.perm <- Perms.Connection.set_target (get_perm con) ~perms:[Perms.READ; Perms.WRITE] target_domid - --let is_backend_mmap con = match con.xb.Xenbus.Xb.backend with -- | Xenbus.Xb.Xenmmap _ -> true -- | _ -> false -+let is_backend_mmap con = Xenbus.Xb.is_mmap con.xb - - let send_reply con tid rid ty data = - if (String.length data) > xenstore_payload_max && (is_backend_mmap con) then -@@ -280,9 +278,7 @@ let get_transaction con tid = - - let do_input con = Xenbus.Xb.input con.xb - let has_input con = Xenbus.Xb.has_in_packet con.xb --let has_partial_input con = match con.xb.Xenbus.Xb.partial_in with -- | HaveHdr _ -> true -- | NoHdr (n, _) -> n < Xenbus.Partial.header_size () -+let has_partial_input con = Xenbus.Xb.has_partial_input con.xb - let pop_in con = Xenbus.Xb.get_in_packet con.xb - let has_more_input con = Xenbus.Xb.has_more_input con.xb - --- -2.37.4 - diff --git a/0066-tools-ocaml-Change-Xb.input-to-return-Packet.t-optio.patch b/0066-tools-ocaml-Change-Xb.input-to-return-Packet.t-optio.patch deleted file mode 100644 index 0fa056d..0000000 --- a/0066-tools-ocaml-Change-Xb.input-to-return-Packet.t-optio.patch +++ /dev/null @@ -1,224 +0,0 @@ -From 59981b08c8ef6eed37b1171656c2a5f3b4b74012 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:02 +0100 -Subject: [PATCH 66/87] tools/ocaml: Change Xb.input to return Packet.t option -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The queue here would only ever hold at most one element. This will simplify -follow-up patches. - -This is part of XSA-326. 
- -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit c0a86a462721008eca5ff733660de094d3c34bc7) ---- - tools/ocaml/libs/xb/xb.ml | 18 +++++------------- - tools/ocaml/libs/xb/xb.mli | 5 +---- - tools/ocaml/libs/xs/xsraw.ml | 20 ++++++-------------- - tools/ocaml/xenstored/connection.ml | 4 +--- - tools/ocaml/xenstored/process.ml | 15 +++++++-------- - 5 files changed, 20 insertions(+), 42 deletions(-) - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 8404ddd8a682..165fd4a1edf4 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -45,7 +45,6 @@ type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes - type t = - { - backend: backend; -- pkt_in: Packet.t Queue.t; - pkt_out: Packet.t Queue.t; - mutable partial_in: partial_buf; - mutable partial_out: string; -@@ -62,7 +61,6 @@ let reconnect t = match t.backend with - Xs_ring.close backend.mmap; - backend.eventchn_notify (); - (* Clear our old connection state *) -- Queue.clear t.pkt_in; - Queue.clear t.pkt_out; - t.partial_in <- init_partial_in (); - t.partial_out <- "" -@@ -124,7 +122,6 @@ let output con = - - (* NB: can throw Reconnect *) - let input con = -- let newpacket = ref false in - let to_read = - match con.partial_in with - | HaveHdr partial_pkt -> Partial.to_complete partial_pkt -@@ -143,21 +140,19 @@ let input con = - if Partial.to_complete partial_pkt = 0 then ( - let pkt = Packet.of_partialpkt partial_pkt in - con.partial_in <- init_partial_in (); -- Queue.push pkt con.pkt_in; -- newpacket := true -- ) -+ Some pkt -+ ) else None - | NoHdr (i, buf) -> - (* we complete the partial header *) - if sz > 0 then - Bytes.blit b 0 buf (Partial.header_size () - i) sz; - con.partial_in <- if sz = i then -- HaveHdr (Partial.of_string (Bytes.to_string buf)) else NoHdr (i - sz, buf) -- ); -- !newpacket -+ HaveHdr (Partial.of_string (Bytes.to_string buf)) else NoHdr (i - sz, 
buf); -+ None -+ ) - - let newcon backend = { - backend = backend; -- pkt_in = Queue.create (); - pkt_out = Queue.create (); - partial_in = init_partial_in (); - partial_out = ""; -@@ -193,9 +188,6 @@ let has_output con = has_new_output con || has_old_output con - - let peek_output con = Queue.peek con.pkt_out - --let input_len con = Queue.length con.pkt_in --let has_in_packet con = Queue.length con.pkt_in > 0 --let get_in_packet con = Queue.pop con.pkt_in - let has_partial_input con = match con.partial_in with - | HaveHdr _ -> true - | NoHdr (n, _) -> n < Partial.header_size () -diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli -index 794e35bb343e..91c682162cea 100644 ---- a/tools/ocaml/libs/xb/xb.mli -+++ b/tools/ocaml/libs/xb/xb.mli -@@ -77,7 +77,7 @@ val write_fd : backend_fd -> 'a -> string -> int -> int - val write_mmap : backend_mmap -> 'a -> string -> int -> int - val write : t -> string -> int -> int - val output : t -> bool --val input : t -> bool -+val input : t -> Packet.t option - val newcon : backend -> t - val open_fd : Unix.file_descr -> t - val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> t -@@ -89,10 +89,7 @@ val has_new_output : t -> bool - val has_old_output : t -> bool - val has_output : t -> bool - val peek_output : t -> Packet.t --val input_len : t -> int --val has_in_packet : t -> bool - val has_partial_input : t -> bool --val get_in_packet : t -> Packet.t - val has_more_input : t -> bool - val is_selectable : t -> bool - val get_fd : t -> Unix.file_descr -diff --git a/tools/ocaml/libs/xs/xsraw.ml b/tools/ocaml/libs/xs/xsraw.ml -index d982fb24dbb1..451f8b38dbcc 100644 ---- a/tools/ocaml/libs/xs/xsraw.ml -+++ b/tools/ocaml/libs/xs/xsraw.ml -@@ -94,26 +94,18 @@ let pkt_send con = - done - - (* receive one packet - can sleep *) --let pkt_recv con = -- let workdone = ref false in -- while not !workdone -- do -- workdone := Xb.input con.xb -- done; -- Xb.get_in_packet con.xb -+let rec pkt_recv con = -+ match Xb.input 
con.xb with -+ | Some packet -> packet -+ | None -> pkt_recv con - - let pkt_recv_timeout con timeout = - let fd = Xb.get_fd con.xb in - let r, _, _ = Unix.select [ fd ] [] [] timeout in - if r = [] then - true, None -- else ( -- let workdone = Xb.input con.xb in -- if workdone then -- false, (Some (Xb.get_in_packet con.xb)) -- else -- false, None -- ) -+ else -+ false, Xb.input con.xb - - let queue_watchevent con data = - let ls = split_string ~limit:2 '\000' data in -diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml -index 38b47363a173..cc20e047d2b9 100644 ---- a/tools/ocaml/xenstored/connection.ml -+++ b/tools/ocaml/xenstored/connection.ml -@@ -277,9 +277,7 @@ let get_transaction con tid = - Hashtbl.find con.transactions tid - - let do_input con = Xenbus.Xb.input con.xb --let has_input con = Xenbus.Xb.has_in_packet con.xb - let has_partial_input con = Xenbus.Xb.has_partial_input con.xb --let pop_in con = Xenbus.Xb.get_in_packet con.xb - let has_more_input con = Xenbus.Xb.has_more_input con.xb - - let has_output con = Xenbus.Xb.has_output con.xb -@@ -307,7 +305,7 @@ let is_bad con = match con.dom with None -> false | Some dom -> Domain.is_bad_do - Restrictions below can be relaxed once xenstored learns to dump more - of its live state in a safe way *) - let has_extra_connection_data con = -- let has_in = has_input con || has_partial_input con in -+ let has_in = has_partial_input con in - let has_out = has_output con in - let has_socket = con.dom = None in - let has_nondefault_perms = make_perm con.dom <> con.perm in -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index dd58e6979cf9..cbf708213796 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -195,10 +195,9 @@ let parse_live_update args = - | _ when Unix.gettimeofday () < t.deadline -> false - | l -> - warn "timeout reached: have to wait, migrate or shutdown %d domains:" (List.length l); -- let msgs 
= List.rev_map (fun con -> Printf.sprintf "%s: %d tx, in: %b, out: %b, perm: %s" -+ let msgs = List.rev_map (fun con -> Printf.sprintf "%s: %d tx, out: %b, perm: %s" - (Connection.get_domstr con) - (Connection.number_of_transactions con) -- (Connection.has_input con) - (Connection.has_output con) - (Connection.get_perm con |> Perms.Connection.to_string) - ) l in -@@ -706,16 +705,17 @@ let do_input store cons doms con = - info "%s requests a reconnect" (Connection.get_domstr con); - History.reconnect con; - info "%s reconnection complete" (Connection.get_domstr con); -- false -+ None - | Failure exp -> - error "caught exception %s" exp; - error "got a bad client %s" (sprintf "%-8s" (Connection.get_domstr con)); - Connection.mark_as_bad con; -- false -+ None - in - -- if newpacket then ( -- let packet = Connection.pop_in con in -+ match newpacket with -+ | None -> () -+ | Some packet -> - let tid, rid, ty, data = Xenbus.Xb.Packet.unpack packet in - let req = {Packet.tid=tid; Packet.rid=rid; Packet.ty=ty; Packet.data=data} in - -@@ -725,8 +725,7 @@ let do_input store cons doms con = - (Xenbus.Xb.Op.to_string ty) (sanitize_data data); *) - process_packet ~store ~cons ~doms ~con ~req; - write_access_log ~ty ~tid ~con:(Connection.get_domstr con) ~data; -- Connection.incr_ops con; -- ) -+ Connection.incr_ops con - - let do_output _store _cons _doms con = - if Connection.has_output con then ( --- -2.37.4 - diff --git a/0067-tools-ocaml-xb-Add-BoundedQueue.patch b/0067-tools-ocaml-xb-Add-BoundedQueue.patch deleted file mode 100644 index 9a141a3..0000000 --- a/0067-tools-ocaml-xb-Add-BoundedQueue.patch +++ /dev/null @@ -1,133 +0,0 @@ -From ea1567893b05df03fe65657f0a25211a6a9ff7ec Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:03 +0100 -Subject: [PATCH 67/87] tools/ocaml/xb: Add BoundedQueue -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Ensures we 
cannot store more than [capacity] elements in a [Queue]. Replacing -all Queue with this module will then ensure at compile time that all Queues -are correctly bound checked. - -Each element in the queue has a class with its own limits. This, in a -subsequent change, will ensure that command responses can proceed during a -flood of watch events. - -No functional change. - -This is part of XSA-326. - -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 19171fb5d888b4467a7073e8febc5e05540956e9) ---- - tools/ocaml/libs/xb/xb.ml | 92 +++++++++++++++++++++++++++++++++++++++ - 1 file changed, 92 insertions(+) - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 165fd4a1edf4..4197a3888a68 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -17,6 +17,98 @@ - module Op = struct include Op end - module Packet = struct include Packet end - -+module BoundedQueue : sig -+ type ('a, 'b) t -+ -+ (** [create ~capacity ~classify ~limit] creates a queue with maximum [capacity] elements. -+ This is burst capacity, each element is further classified according to [classify], -+ and each class can have its own [limit]. -+ [capacity] is enforced as an overall limit. -+ The [limit] can be dynamic, and can be smaller than the number of elements already queued of that class, -+ in which case those elements are considered to use "burst capacity". -+ *) -+ val create: capacity:int -> classify:('a -> 'b) -> limit:('b -> int) -> ('a, 'b) t -+ -+ (** [clear q] discards all elements from [q] *) -+ val clear: ('a, 'b) t -> unit -+ -+ (** [can_push q] when [length q < capacity]. *) -+ val can_push: ('a, 'b) t -> 'b -> bool -+ -+ (** [push e q] adds [e] at the end of queue [q] if [can_push q], or returns [None]. *) -+ val push: 'a -> ('a, 'b) t -> unit option -+ -+ (** [pop q] removes and returns first element in [q], or raises [Queue.Empty]. 
*) -+ val pop: ('a, 'b) t -> 'a -+ -+ (** [peek q] returns the first element in [q], or raises [Queue.Empty]. *) -+ val peek : ('a, 'b) t -> 'a -+ -+ (** [length q] returns the current number of elements in [q] *) -+ val length: ('a, 'b) t -> int -+ -+ (** [debug string_of_class q] prints queue usage statistics in an unspecified internal format. *) -+ val debug: ('b -> string) -> (_, 'b) t -> string -+end = struct -+ type ('a, 'b) t = -+ { q: 'a Queue.t -+ ; capacity: int -+ ; classify: 'a -> 'b -+ ; limit: 'b -> int -+ ; class_count: ('b, int) Hashtbl.t -+ } -+ -+ let create ~capacity ~classify ~limit = -+ { capacity; q = Queue.create (); classify; limit; class_count = Hashtbl.create 3 } -+ -+ let get_count t classification = try Hashtbl.find t.class_count classification with Not_found -> 0 -+ -+ let can_push_internal t classification class_count = -+ Queue.length t.q < t.capacity && class_count < t.limit classification -+ -+ let ok = Some () -+ -+ let push e t = -+ let classification = t.classify e in -+ let class_count = get_count t classification in -+ if can_push_internal t classification class_count then begin -+ Queue.push e t.q; -+ Hashtbl.replace t.class_count classification (class_count + 1); -+ ok -+ end -+ else -+ None -+ -+ let can_push t classification = -+ can_push_internal t classification @@ get_count t classification -+ -+ let clear t = -+ Queue.clear t.q; -+ Hashtbl.reset t.class_count -+ -+ let pop t = -+ let e = Queue.pop t.q in -+ let classification = t.classify e in -+ let () = match get_count t classification - 1 with -+ | 0 -> Hashtbl.remove t.class_count classification (* reduces memusage *) -+ | n -> Hashtbl.replace t.class_count classification n -+ in -+ e -+ -+ let peek t = Queue.peek t.q -+ let length t = Queue.length t.q -+ -+ let debug string_of_class t = -+ let b = Buffer.create 128 in -+ Printf.bprintf b "BoundedQueue capacity: %d, used: {" t.capacity; -+ Hashtbl.iter (fun packet_class count -> -+ Printf.bprintf b " %s: %d" 
(string_of_class packet_class) count -+ ) t.class_count; -+ Printf.bprintf b "}"; -+ Buffer.contents b -+end -+ -+ - exception End_of_file - exception Eagain - exception Noent --- -2.37.4 - diff --git a/0068-tools-ocaml-Limit-maximum-in-flight-requests-outstan.patch b/0068-tools-ocaml-Limit-maximum-in-flight-requests-outstan.patch deleted file mode 100644 index 0572fa1..0000000 --- a/0068-tools-ocaml-Limit-maximum-in-flight-requests-outstan.patch +++ /dev/null @@ -1,888 +0,0 @@ -From cec3c52c287f5aee7de061b40765aca5301cf9ca Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:04 +0100 -Subject: [PATCH 68/87] tools/ocaml: Limit maximum in-flight requests / - outstanding replies -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Introduce a limit on the number of outstanding reply packets in the xenbus -queue. This limits the number of in-flight requests: when the output queue is -full we'll stop processing inputs until the output queue has room again. - -To avoid a busy loop on the Unix socket we only add it to the watched input -file descriptor set if we'd be able to call `input` on it. Even though Dom0 -is trusted and exempt from quotas a flood of events might cause a backlog -where events are produced faster than daemons in Dom0 can consume them, which -could lead to an unbounded queue size and OOM. - -Therefore the xenbus queue limit must apply to all connections, Dom0 is not -exempt from it, although if everything works correctly it will eventually -catch up. - -This prevents a malicious guest from sending more commands while it has -outstanding watch events or command replies in its input ring. However if it -can cause the generation of watch events by other means (e.g. by Dom0, or -another cooperative guest) and stop reading its own ring then watch events -would've queued up without limit. 
- -The xenstore protocol doesn't have a back-pressure mechanism, and doesn't -allow dropping watch events. In fact, dropping watch events is known to break -some pieces of normal functionality. This leaves little choice to safely -implement the xenstore protocol without exposing the xenstore daemon to -out-of-memory attacks. - -Implement the fix as pipes with bounded buffers: -* Use a bounded buffer for watch events -* The watch structure will have a bounded receiving pipe of watch events -* The source will have an "overflow" pipe of pending watch events it couldn't - deliver - -Items are queued up on one end and are sent as far along the pipe as possible: - - source domain -> watch -> xenbus of target -> xenstore ring/socket of target - -If the pipe is "full" at any point then back-pressure is applied and we prevent -more items from being queued up. For the source domain this means that we'll -stop accepting new commands as long as its pipe buffer is not empty. - -Before we try to enqueue an item we first check whether it is possible to send -it further down the pipe, by attempting to recursively flush the pipes. This -ensures that we retain the order of events as much as possible. - -We might break causality of watch events if the target domain's queue is full -and we need to start using the watch's queue. This is a breaking change in -the xenstore protocol, but only for domains which are not processing their -incoming ring as expected. - -When a watch is deleted its entire pending queue is dropped (no code is needed -for that, because it is part of the 'watch' type). - -There is a cache of watches that have pending events that we attempt to flush -at every cycle if possible. - -Introduce 3 limits here: -* quota-maxwatchevents on watch event destination: when this is hit the - source will not be allowed to queue up more watch events. 
-* quota-maxoustanding which is the number of responses not read from the ring: - once exceeded, no more inputs are processed until all outstanding replies - are consumed by the client. -* overflow queue on the watch event source: all watches that cannot be stored - on destination are queued up here, a single command can trigger multiple - watches (e.g. due to recursion). - -The overflow queue currently doesn't have an upper bound, it is difficult to -accurately calculate one as it depends on whether you are Dom0 and how many -watches each path has registered and how many watch events you can trigger -with a single command (e.g. a commit). However these events were already -using memory, this just moves them elsewhere, and as long as we correctly -block a domain it shouldn't result in unbounded memory usage. - -Note that Dom0 is not excluded from these checks, it is important that Dom0 is -especially not excluded when it is the source, since there are many ways in -which a guest could trigger Dom0 to send it watch events. - -This should protect against malicious frontends as long as the backend follows -the PV xenstore protocol and only exposes paths needed by the frontend, and -changes those paths at most once as a reaction to guest events, or protocol -state. - -The queue limits are per watch, and per domain-pair, so even if one -communication channel would be "blocked", others would keep working, and the -domain itself won't get blocked as long as it doesn't overflow the queue of -watch events. - -Similarly a malicious backend could cause the frontend to get blocked, but -this watch queue protects the frontend as well as long as it follows the PV -protocol. (Although note that protection against malicious backends is only a -best effort at the moment) - -This is part of XSA-326 / CVE-2022-42318. 
- -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit 9284ae0c40fb5b9606947eaaec23dc71d0540e96) ---- - tools/ocaml/libs/xb/xb.ml | 61 +++++++-- - tools/ocaml/libs/xb/xb.mli | 11 +- - tools/ocaml/libs/xs/queueop.ml | 25 ++-- - tools/ocaml/libs/xs/xsraw.ml | 4 +- - tools/ocaml/xenstored/connection.ml | 155 +++++++++++++++++++++-- - tools/ocaml/xenstored/connections.ml | 57 +++++++-- - tools/ocaml/xenstored/define.ml | 7 + - tools/ocaml/xenstored/oxenstored.conf.in | 2 + - tools/ocaml/xenstored/process.ml | 31 ++++- - tools/ocaml/xenstored/xenstored.ml | 2 + - 10 files changed, 296 insertions(+), 59 deletions(-) - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 4197a3888a68..b292ed7a874d 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -134,14 +134,44 @@ type backend = Fd of backend_fd | Xenmmap of backend_mmap - - type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes - -+(* -+ separate capacity reservation for replies and watch events: -+ this allows a domain to keep working even when under a constant flood of -+ watch events -+*) -+type capacity = { maxoutstanding: int; maxwatchevents: int } -+ -+module Queue = BoundedQueue -+ -+type packet_class = -+ | CommandReply -+ | Watchevent -+ -+let string_of_packet_class = function -+ | CommandReply -> "command_reply" -+ | Watchevent -> "watch_event" -+ - type t = - { - backend: backend; -- pkt_out: Packet.t Queue.t; -+ pkt_out: (Packet.t, packet_class) Queue.t; - mutable partial_in: partial_buf; - mutable partial_out: string; -+ capacity: capacity - } - -+let to_read con = -+ match con.partial_in with -+ | HaveHdr partial_pkt -> Partial.to_complete partial_pkt -+ | NoHdr (i, _) -> i -+ -+let debug t = -+ Printf.sprintf "XenBus state: partial_in: %d needed, partial_out: %d bytes, pkt_out: %d packets, %s" -+ (to_read t) -+ (String.length t.partial_out) -+ (Queue.length 
t.pkt_out) -+ (BoundedQueue.debug string_of_packet_class t.pkt_out) -+ - let init_partial_in () = NoHdr - (Partial.header_size (), Bytes.make (Partial.header_size()) '\000') - -@@ -199,7 +229,8 @@ let output con = - let s = if String.length con.partial_out > 0 then - con.partial_out - else if Queue.length con.pkt_out > 0 then -- Packet.to_string (Queue.pop con.pkt_out) -+ let pkt = Queue.pop con.pkt_out in -+ Packet.to_string pkt - else - "" in - (* send data from s, and save the unsent data to partial_out *) -@@ -212,12 +243,15 @@ let output con = - (* after sending one packet, partial is empty *) - con.partial_out = "" - -+(* we can only process an input packet if we're guaranteed to have room -+ to store the response packet *) -+let can_input con = Queue.can_push con.pkt_out CommandReply -+ - (* NB: can throw Reconnect *) - let input con = -- let to_read = -- match con.partial_in with -- | HaveHdr partial_pkt -> Partial.to_complete partial_pkt -- | NoHdr (i, _) -> i in -+ if not (can_input con) then None -+ else -+ let to_read = to_read con in - - (* try to get more data from input stream *) - let b = Bytes.make to_read '\000' in -@@ -243,11 +277,22 @@ let input con = - None - ) - --let newcon backend = { -+let classify t = -+ match t.Packet.ty with -+ | Op.Watchevent -> Watchevent -+ | _ -> CommandReply -+ -+let newcon ~capacity backend = -+ let limit = function -+ | CommandReply -> capacity.maxoutstanding -+ | Watchevent -> capacity.maxwatchevents -+ in -+ { - backend = backend; -- pkt_out = Queue.create (); -+ pkt_out = Queue.create ~capacity:(capacity.maxoutstanding + capacity.maxwatchevents) ~classify ~limit; - partial_in = init_partial_in (); - partial_out = ""; -+ capacity = capacity; - } - - let open_fd fd = newcon (Fd { fd = fd; }) -diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli -index 91c682162cea..71b2754ca788 100644 ---- a/tools/ocaml/libs/xb/xb.mli -+++ b/tools/ocaml/libs/xb/xb.mli -@@ -66,10 +66,11 @@ type backend_mmap = { - 
type backend_fd = { fd : Unix.file_descr; } - type backend = Fd of backend_fd | Xenmmap of backend_mmap - type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes -+type capacity = { maxoutstanding: int; maxwatchevents: int } - type t - val init_partial_in : unit -> partial_buf - val reconnect : t -> unit --val queue : t -> Packet.t -> unit -+val queue : t -> Packet.t -> unit option - val read_fd : backend_fd -> 'a -> bytes -> int -> int - val read_mmap : backend_mmap -> 'a -> bytes -> int -> int - val read : t -> bytes -> int -> int -@@ -78,13 +79,14 @@ val write_mmap : backend_mmap -> 'a -> string -> int -> int - val write : t -> string -> int -> int - val output : t -> bool - val input : t -> Packet.t option --val newcon : backend -> t --val open_fd : Unix.file_descr -> t --val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> t -+val newcon : capacity:capacity -> backend -> t -+val open_fd : Unix.file_descr -> capacity:capacity -> t -+val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> capacity:capacity -> t - val close : t -> unit - val is_fd : t -> bool - val is_mmap : t -> bool - val output_len : t -> int -+val can_input: t -> bool - val has_new_output : t -> bool - val has_old_output : t -> bool - val has_output : t -> bool -@@ -93,3 +95,4 @@ val has_partial_input : t -> bool - val has_more_input : t -> bool - val is_selectable : t -> bool - val get_fd : t -> Unix.file_descr -+val debug: t -> string -diff --git a/tools/ocaml/libs/xs/queueop.ml b/tools/ocaml/libs/xs/queueop.ml -index 9ff5bbd529ce..4e532cdaeacb 100644 ---- a/tools/ocaml/libs/xs/queueop.ml -+++ b/tools/ocaml/libs/xs/queueop.ml -@@ -16,9 +16,10 @@ - open Xenbus - - let data_concat ls = (String.concat "\000" ls) ^ "\000" -+let queue con pkt = let r = Xb.queue con pkt in assert (r <> None) - let queue_path ty (tid: int) (path: string) con = - let data = data_concat [ path; ] in -- Xb.queue con (Xb.Packet.create tid 0 ty data) -+ queue con (Xb.Packet.create tid 0 ty data) - - 
(* operations *) - let directory tid path con = queue_path Xb.Op.Directory tid path con -@@ -27,48 +28,48 @@ let read tid path con = queue_path Xb.Op.Read tid path con - let getperms tid path con = queue_path Xb.Op.Getperms tid path con - - let debug commands con = -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Debug (data_concat commands)) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Debug (data_concat commands)) - - let watch path data con = - let data = data_concat [ path; data; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Watch data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Watch data) - - let unwatch path data con = - let data = data_concat [ path; data; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Unwatch data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Unwatch data) - - let transaction_start con = -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Transaction_start (data_concat [])) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Transaction_start (data_concat [])) - - let transaction_end tid commit con = - let data = data_concat [ (if commit then "T" else "F"); ] in -- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Transaction_end data) -+ queue con (Xb.Packet.create tid 0 Xb.Op.Transaction_end data) - - let introduce domid mfn port con = - let data = data_concat [ Printf.sprintf "%u" domid; - Printf.sprintf "%nu" mfn; - string_of_int port; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Introduce data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Introduce data) - - let release domid con = - let data = data_concat [ Printf.sprintf "%u" domid; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Release data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Release data) - - let resume domid con = - let data = data_concat [ Printf.sprintf "%u" domid; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Resume data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Resume data) - - let getdomainpath domid con = - let data = data_concat [ Printf.sprintf "%u" domid; ] in -- Xb.queue con (Xb.Packet.create 0 
0 Xb.Op.Getdomainpath data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Getdomainpath data) - - let write tid path value con = - let data = path ^ "\000" ^ value (* no NULL at the end *) in -- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Write data) -+ queue con (Xb.Packet.create tid 0 Xb.Op.Write data) - - let mkdir tid path con = queue_path Xb.Op.Mkdir tid path con - let rm tid path con = queue_path Xb.Op.Rm tid path con - - let setperms tid path perms con = - let data = data_concat [ path; perms ] in -- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Setperms data) -+ queue con (Xb.Packet.create tid 0 Xb.Op.Setperms data) -diff --git a/tools/ocaml/libs/xs/xsraw.ml b/tools/ocaml/libs/xs/xsraw.ml -index 451f8b38dbcc..cbd17280600c 100644 ---- a/tools/ocaml/libs/xs/xsraw.ml -+++ b/tools/ocaml/libs/xs/xsraw.ml -@@ -36,8 +36,10 @@ type con = { - let close con = - Xb.close con.xb - -+let capacity = { Xb.maxoutstanding = 1; maxwatchevents = 0; } -+ - let open_fd fd = { -- xb = Xb.open_fd fd; -+ xb = Xb.open_fd ~capacity fd; - watchevents = Queue.create (); - } - -diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml -index cc20e047d2b9..9624a5f9da2c 100644 ---- a/tools/ocaml/xenstored/connection.ml -+++ b/tools/ocaml/xenstored/connection.ml -@@ -20,12 +20,84 @@ open Stdext - - let xenstore_payload_max = 4096 (* xen/include/public/io/xs_wire.h *) - -+type 'a bounded_sender = 'a -> unit option -+(** a bounded sender accepts an ['a] item and returns: -+ None - if there is no room to accept the item -+ Some () - if it has successfully accepted/sent the item -+ *) -+ -+module BoundedPipe : sig -+ type 'a t -+ -+ (** [create ~capacity ~destination] creates a bounded pipe with a -+ local buffer holding at most [capacity] items. Once the buffer is -+ full it will not accept further items. items from the pipe are -+ flushed into [destination] as long as it accepts items. The -+ destination could be another pipe. 
-+ *) -+ val create: capacity:int -> destination:'a bounded_sender -> 'a t -+ -+ (** [is_empty t] returns whether the local buffer of [t] is empty. *) -+ val is_empty : _ t -> bool -+ -+ (** [length t] the number of items in the internal buffer *) -+ val length: _ t -> int -+ -+ (** [flush_pipe t] sends as many items from the local buffer as possible, -+ which could be none. *) -+ val flush_pipe: _ t -> unit -+ -+ (** [push t item] tries to [flush_pipe] and then push [item] -+ into the pipe if its [capacity] allows. -+ Returns [None] if there is no more room -+ *) -+ val push : 'a t -> 'a bounded_sender -+end = struct -+ (* items are enqueued in [q], and then flushed to [connect_to] *) -+ type 'a t = -+ { q: 'a Queue.t -+ ; destination: 'a bounded_sender -+ ; capacity: int -+ } -+ -+ let create ~capacity ~destination = -+ { q = Queue.create (); capacity; destination } -+ -+ let rec flush_pipe t = -+ if not Queue.(is_empty t.q) then -+ let item = Queue.peek t.q in -+ match t.destination item with -+ | None -> () (* no room *) -+ | Some () -> -+ (* successfully sent item to next stage *) -+ let _ = Queue.pop t.q in -+ (* continue trying to send more items *) -+ flush_pipe t -+ -+ let push t item = -+ (* first try to flush as many items from this pipe as possible to make room, -+ it is important to do this first to preserve the order of the items -+ *) -+ flush_pipe t; -+ if Queue.length t.q < t.capacity then begin -+ (* enqueue, instead of sending directly. 
-+ this ensures that [out] sees the items in the same order as we receive them -+ *) -+ Queue.push item t.q; -+ Some (flush_pipe t) -+ end else None -+ -+ let is_empty t = Queue.is_empty t.q -+ let length t = Queue.length t.q -+end -+ - type watch = { - con: t; - token: string; - path: string; - base: string; - is_relative: bool; -+ pending_watchevents: Xenbus.Xb.Packet.t BoundedPipe.t; - } - - and t = { -@@ -38,8 +110,36 @@ and t = { - anonid: int; - mutable stat_nb_ops: int; - mutable perm: Perms.Connection.t; -+ pending_source_watchevents: (watch * Xenbus.Xb.Packet.t) BoundedPipe.t - } - -+module Watch = struct -+ module T = struct -+ type t = watch -+ -+ let compare w1 w2 = -+ (* cannot compare watches from different connections *) -+ assert (w1.con == w2.con); -+ match String.compare w1.token w2.token with -+ | 0 -> String.compare w1.path w2.path -+ | n -> n -+ end -+ module Set = Set.Make(T) -+ -+ let flush_events t = -+ BoundedPipe.flush_pipe t.pending_watchevents; -+ not (BoundedPipe.is_empty t.pending_watchevents) -+ -+ let pending_watchevents t = -+ BoundedPipe.length t.pending_watchevents -+end -+ -+let source_flush_watchevents t = -+ BoundedPipe.flush_pipe t.pending_source_watchevents -+ -+let source_pending_watchevents t = -+ BoundedPipe.length t.pending_source_watchevents -+ - let mark_as_bad con = - match con.dom with - |None -> () -@@ -67,7 +167,8 @@ let watch_create ~con ~path ~token = { - token = token; - path = path; - base = get_path con; -- is_relative = path.[0] <> '/' && path.[0] <> '@' -+ is_relative = path.[0] <> '/' && path.[0] <> '@'; -+ pending_watchevents = BoundedPipe.create ~capacity:!Define.maxwatchevents ~destination:(Xenbus.Xb.queue con.xb) - } - - let get_con w = w.con -@@ -93,6 +194,9 @@ let make_perm dom = - Perms.Connection.create ~perms:[Perms.READ; Perms.WRITE] domid - - let create xbcon dom = -+ let destination (watch, pkt) = -+ BoundedPipe.push watch.pending_watchevents pkt -+ in - let id = - match dom with - | None -> let 
old = !anon_id_next in incr anon_id_next; old -@@ -109,6 +213,16 @@ let create xbcon dom = - anonid = id; - stat_nb_ops = 0; - perm = make_perm dom; -+ -+ (* the actual capacity will be lower, this is used as an overflow -+ buffer: anything that doesn't fit elsewhere gets put here, only -+ limited by the amount of watches that you can generate with a -+ single xenstore command (which is finite, although possibly very -+ large in theory for Dom0). Once the pipe here has any contents the -+ domain is blocked from sending more commands until it is empty -+ again though. -+ *) -+ pending_source_watchevents = BoundedPipe.create ~capacity:Sys.max_array_length ~destination - } - in - Logging.new_connection ~tid:Transaction.none ~con:(get_domstr con); -@@ -127,11 +241,17 @@ let set_target con target_domid = - - let is_backend_mmap con = Xenbus.Xb.is_mmap con.xb - --let send_reply con tid rid ty data = -+let packet_of con tid rid ty data = - if (String.length data) > xenstore_payload_max && (is_backend_mmap con) then -- Xenbus.Xb.queue con.xb (Xenbus.Xb.Packet.create tid rid Xenbus.Xb.Op.Error "E2BIG\000") -+ Xenbus.Xb.Packet.create tid rid Xenbus.Xb.Op.Error "E2BIG\000" - else -- Xenbus.Xb.queue con.xb (Xenbus.Xb.Packet.create tid rid ty data) -+ Xenbus.Xb.Packet.create tid rid ty data -+ -+let send_reply con tid rid ty data = -+ let result = Xenbus.Xb.queue con.xb (packet_of con tid rid ty data) in -+ (* should never happen: we only process an input packet when there is room for an output packet *) -+ (* and the limit for replies is different from the limit for watch events *) -+ assert (result <> None) - - let send_error con tid rid err = send_reply con tid rid Xenbus.Xb.Op.Error (err ^ "\000") - let send_ack con tid rid ty = send_reply con tid rid ty "OK\000" -@@ -181,11 +301,11 @@ let del_watch con path token = - apath, w - - let del_watches con = -- Hashtbl.clear con.watches; -+ Hashtbl.reset con.watches; - con.nb_watches <- 0 - - let del_transactions con = -- 
Hashtbl.clear con.transactions -+ Hashtbl.reset con.transactions - - let list_watches con = - let ll = Hashtbl.fold -@@ -208,21 +328,29 @@ let lookup_watch_perm path = function - let lookup_watch_perms oldroot root path = - lookup_watch_perm path oldroot @ lookup_watch_perm path (Some root) - --let fire_single_watch_unchecked watch = -+let fire_single_watch_unchecked source watch = - let data = Utils.join_by_null [watch.path; watch.token; ""] in -- send_reply watch.con Transaction.none 0 Xenbus.Xb.Op.Watchevent data -+ let pkt = packet_of watch.con Transaction.none 0 Xenbus.Xb.Op.Watchevent data in - --let fire_single_watch (oldroot, root) watch = -+ match BoundedPipe.push source.pending_source_watchevents (watch, pkt) with -+ | Some () -> () (* packet queued *) -+ | None -> -+ (* a well behaved Dom0 shouldn't be able to trigger this, -+ if it happens it is likely a Dom0 bug causing runaway memory usage -+ *) -+ failwith "watch event overflow, cannot happen" -+ -+let fire_single_watch source (oldroot, root) watch = - let abspath = get_watch_path watch.con watch.path |> Store.Path.of_string in - let perms = lookup_watch_perms oldroot root abspath in - if Perms.can_fire_watch watch.con.perm perms then -- fire_single_watch_unchecked watch -+ fire_single_watch_unchecked source watch - else - let perms = perms |> List.map (Perms.Node.to_string ~sep:" ") |> String.concat ", " in - let con = get_domstr watch.con in - Logging.watch_not_fired ~con perms (Store.Path.to_string abspath) - --let fire_watch roots watch path = -+let fire_watch source roots watch path = - let new_path = - if watch.is_relative && path.[0] = '/' - then begin -@@ -232,7 +360,7 @@ let fire_watch roots watch path = - end else - path - in -- fire_single_watch roots { watch with path = new_path } -+ fire_single_watch source roots { watch with path = new_path } - - (* Search for a valid unused transaction id. 
*) - let rec valid_transaction_id con proposed_id = -@@ -280,6 +408,7 @@ let do_input con = Xenbus.Xb.input con.xb - let has_partial_input con = Xenbus.Xb.has_partial_input con.xb - let has_more_input con = Xenbus.Xb.has_more_input con.xb - -+let can_input con = Xenbus.Xb.can_input con.xb && BoundedPipe.is_empty con.pending_source_watchevents - let has_output con = Xenbus.Xb.has_output con.xb - let has_old_output con = Xenbus.Xb.has_old_output con.xb - let has_new_output con = Xenbus.Xb.has_new_output con.xb -@@ -323,7 +452,7 @@ let prevents_live_update con = not (is_bad con) - && (has_extra_connection_data con || has_transaction_data con) - - let has_more_work con = -- has_more_input con || not (has_old_output con) && has_new_output con -+ (has_more_input con && can_input con) || not (has_old_output con) && has_new_output con - - let incr_ops con = con.stat_nb_ops <- con.stat_nb_ops + 1 - -diff --git a/tools/ocaml/xenstored/connections.ml b/tools/ocaml/xenstored/connections.ml -index 3c7429fe7f61..7d68c583b43a 100644 ---- a/tools/ocaml/xenstored/connections.ml -+++ b/tools/ocaml/xenstored/connections.ml -@@ -22,22 +22,30 @@ type t = { - domains: (int, Connection.t) Hashtbl.t; - ports: (Xeneventchn.t, Connection.t) Hashtbl.t; - mutable watches: Connection.watch list Trie.t; -+ mutable has_pending_watchevents: Connection.Watch.Set.t - } - - let create () = { - anonymous = Hashtbl.create 37; - domains = Hashtbl.create 37; - ports = Hashtbl.create 37; -- watches = Trie.create () -+ watches = Trie.create (); -+ has_pending_watchevents = Connection.Watch.Set.empty; - } - -+let get_capacity () = -+ (* not multiplied by maxwatch on purpose: 2nd queue in watch itself! 
*) -+ { Xenbus.Xb.maxoutstanding = !Define.maxoutstanding; maxwatchevents = !Define.maxwatchevents } -+ - let add_anonymous cons fd = -- let xbcon = Xenbus.Xb.open_fd fd in -+ let capacity = get_capacity () in -+ let xbcon = Xenbus.Xb.open_fd fd ~capacity in - let con = Connection.create xbcon None in - Hashtbl.add cons.anonymous (Xenbus.Xb.get_fd xbcon) con - - let add_domain cons dom = -- let xbcon = Xenbus.Xb.open_mmap (Domain.get_interface dom) (fun () -> Domain.notify dom) in -+ let capacity = get_capacity () in -+ let xbcon = Xenbus.Xb.open_mmap ~capacity (Domain.get_interface dom) (fun () -> Domain.notify dom) in - let con = Connection.create xbcon (Some dom) in - Hashtbl.add cons.domains (Domain.get_id dom) con; - match Domain.get_port dom with -@@ -48,7 +56,9 @@ let select ?(only_if = (fun _ -> true)) cons = - Hashtbl.fold (fun _ con (ins, outs) -> - if (only_if con) then ( - let fd = Connection.get_fd con in -- (fd :: ins, if Connection.has_output con then fd :: outs else outs) -+ let in_fds = if Connection.can_input con then fd :: ins else ins in -+ let out_fds = if Connection.has_output con then fd :: outs else outs in -+ in_fds, out_fds - ) else (ins, outs) - ) - cons.anonymous ([], []) -@@ -67,10 +77,17 @@ let del_watches_of_con con watches = - | [] -> None - | ws -> Some ws - -+let del_watches cons con = -+ Connection.del_watches con; -+ cons.watches <- Trie.map (del_watches_of_con con) cons.watches; -+ cons.has_pending_watchevents <- -+ cons.has_pending_watchevents |> Connection.Watch.Set.filter @@ fun w -> -+ Connection.get_con w != con -+ - let del_anonymous cons con = - try - Hashtbl.remove cons.anonymous (Connection.get_fd con); -- cons.watches <- Trie.map (del_watches_of_con con) cons.watches; -+ del_watches cons con; - Connection.close con - with exn -> - debug "del anonymous %s" (Printexc.to_string exn) -@@ -85,7 +102,7 @@ let del_domain cons id = - | Some p -> Hashtbl.remove cons.ports p - | None -> ()) - | None -> ()); -- cons.watches <- 
Trie.map (del_watches_of_con con) cons.watches; -+ del_watches cons con; - Connection.close con - with exn -> - debug "del domain %u: %s" id (Printexc.to_string exn) -@@ -136,31 +153,33 @@ let del_watch cons con path token = - cons.watches <- Trie.set cons.watches key watches; - watch - --let del_watches cons con = -- Connection.del_watches con; -- cons.watches <- Trie.map (del_watches_of_con con) cons.watches -- - (* path is absolute *) --let fire_watches ?oldroot root cons path recurse = -+let fire_watches ?oldroot source root cons path recurse = - let key = key_of_path path in - let path = Store.Path.to_string path in - let roots = oldroot, root in - let fire_watch _ = function - | None -> () -- | Some watches -> List.iter (fun w -> Connection.fire_watch roots w path) watches -+ | Some watches -> List.iter (fun w -> Connection.fire_watch source roots w path) watches - in - let fire_rec _x = function - | None -> () - | Some watches -> -- List.iter (Connection.fire_single_watch roots) watches -+ List.iter (Connection.fire_single_watch source roots) watches - in - Trie.iter_path fire_watch cons.watches key; - if recurse then - Trie.iter fire_rec (Trie.sub cons.watches key) - -+let send_watchevents cons con = -+ cons.has_pending_watchevents <- -+ cons.has_pending_watchevents |> Connection.Watch.Set.filter Connection.Watch.flush_events; -+ Connection.source_flush_watchevents con -+ - let fire_spec_watches root cons specpath = -+ let source = find_domain cons 0 in - iter cons (fun con -> -- List.iter (Connection.fire_single_watch (None, root)) (Connection.get_watches con specpath)) -+ List.iter (Connection.fire_single_watch source (None, root)) (Connection.get_watches con specpath)) - - let set_target cons domain target_domain = - let con = find_domain cons domain in -@@ -197,6 +216,16 @@ let debug cons = - let domains = Hashtbl.fold (fun _ con accu -> Connection.debug con :: accu) cons.domains [] in - String.concat "" (domains @ anonymous) - -+let debug_watchevents 
cons con = -+ (* == (physical equality) -+ has to be used here because w.con.xb.backend might contain a [unit->unit] value causing regular -+ comparison to fail due to having a 'functional value' which cannot be compared. -+ *) -+ let s = cons.has_pending_watchevents |> Connection.Watch.Set.filter (fun w -> w.con == con) in -+ let pending = s |> Connection.Watch.Set.elements -+ |> List.map (fun w -> Connection.Watch.pending_watchevents w) |> List.fold_left (+) 0 in -+ Printf.sprintf "Watches with pending events: %d, pending events total: %d" (Connection.Watch.Set.cardinal s) pending -+ - let filter ~f cons = - let fold _ v acc = if f v then v :: acc else acc in - [] -diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml -index ba63a8147e09..327b6d795ec7 100644 ---- a/tools/ocaml/xenstored/define.ml -+++ b/tools/ocaml/xenstored/define.ml -@@ -24,6 +24,13 @@ let default_config_dir = Paths.xen_config_dir - let maxwatch = ref (100) - let maxtransaction = ref (10) - let maxrequests = ref (1024) (* maximum requests per transaction *) -+let maxoutstanding = ref (1024) (* maximum outstanding requests, i.e. 
in-flight requests / domain *) -+let maxwatchevents = ref (1024) -+(* -+ maximum outstanding watch events per watch, -+ recommended >= maxoutstanding to avoid blocking backend transactions due to -+ malicious frontends -+ *) - - let gc_max_overhead = ref 120 (* 120% see comment in xenstored.ml *) - let conflict_burst_limit = ref 5.0 -diff --git a/tools/ocaml/xenstored/oxenstored.conf.in b/tools/ocaml/xenstored/oxenstored.conf.in -index 4ae48e42d47d..9d034e744b4b 100644 ---- a/tools/ocaml/xenstored/oxenstored.conf.in -+++ b/tools/ocaml/xenstored/oxenstored.conf.in -@@ -62,6 +62,8 @@ quota-maxwatch = 100 - quota-transaction = 10 - quota-maxrequests = 1024 - quota-path-max = 1024 -+quota-maxoutstanding = 1024 -+quota-maxwatchevents = 1024 - - # Activate filed base backend - persistent = false -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index cbf708213796..ce39ce28b5f3 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -57,7 +57,7 @@ let split_one_path data con = - | path :: "" :: [] -> Store.Path.create path (Connection.get_path con) - | _ -> raise Invalid_Cmd_Args - --let process_watch t cons = -+let process_watch source t cons = - let oldroot = t.Transaction.oldroot in - let newroot = Store.get_root t.store in - let ops = Transaction.get_paths t |> List.rev in -@@ -67,8 +67,9 @@ let process_watch t cons = - | Xenbus.Xb.Op.Rm -> true, None, oldroot - | Xenbus.Xb.Op.Setperms -> false, Some oldroot, newroot - | _ -> raise (Failure "huh ?") in -- Connections.fire_watches ?oldroot root cons (snd op) recurse in -- List.iter (fun op -> do_op_watch op cons) ops -+ Connections.fire_watches ?oldroot source root cons (snd op) recurse in -+ List.iter (fun op -> do_op_watch op cons) ops; -+ Connections.send_watchevents cons source - - let create_implicit_path t perm path = - let dirname = Store.Path.get_parent path in -@@ -234,6 +235,20 @@ let do_debug con t _domains cons data = - | "watches" :: _ -> 
- let watches = Connections.debug cons in - Some (watches ^ "\000") -+ | "xenbus" :: domid :: _ -> -+ let domid = int_of_string domid in -+ let con = Connections.find_domain cons domid in -+ let s = Printf.sprintf "xenbus: %s; overflow queue length: %d, can_input: %b, has_more_input: %b, has_old_output: %b, has_new_output: %b, has_more_work: %b. pending: %s" -+ (Xenbus.Xb.debug con.xb) -+ (Connection.source_pending_watchevents con) -+ (Connection.can_input con) -+ (Connection.has_more_input con) -+ (Connection.has_old_output con) -+ (Connection.has_new_output con) -+ (Connection.has_more_work con) -+ (Connections.debug_watchevents cons con) -+ in -+ Some s - | "mfn" :: domid :: _ -> - let domid = int_of_string domid in - let con = Connections.find_domain cons domid in -@@ -342,7 +357,7 @@ let reply_ack fct con t doms cons data = - fct con t doms cons data; - Packet.Ack (fun () -> - if Transaction.get_id t = Transaction.none then -- process_watch t cons -+ process_watch con t cons - ) - - let reply_data fct con t doms cons data = -@@ -501,7 +516,7 @@ let do_watch con t _domains cons data = - Packet.Ack (fun () -> - (* xenstore.txt says this watch is fired immediately, - implying even if path doesn't exist or is unreadable *) -- Connection.fire_single_watch_unchecked watch) -+ Connection.fire_single_watch_unchecked con watch) - - let do_unwatch con _t _domains cons data = - let (node, token) = -@@ -532,7 +547,7 @@ let do_transaction_end con t domains cons data = - if not success then - raise Transaction_again; - if commit then begin -- process_watch t cons; -+ process_watch con t cons; - match t.Transaction.ty with - | Transaction.No -> - () (* no need to record anything *) -@@ -700,7 +715,8 @@ let process_packet ~store ~cons ~doms ~con ~req = - let do_input store cons doms con = - let newpacket = - try -- Connection.do_input con -+ if Connection.can_input con then Connection.do_input con -+ else None - with Xenbus.Xb.Reconnect -> - info "%s requests a reconnect" 
(Connection.get_domstr con); - History.reconnect con; -@@ -728,6 +744,7 @@ let do_input store cons doms con = - Connection.incr_ops con - - let do_output _store _cons _doms con = -+ Connection.source_flush_watchevents con; - if Connection.has_output con then ( - if Connection.has_new_output con then ( - let packet = Connection.peek_output con in -diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml -index 3b57ad016dfb..c799e20f1145 100644 ---- a/tools/ocaml/xenstored/xenstored.ml -+++ b/tools/ocaml/xenstored/xenstored.ml -@@ -103,6 +103,8 @@ let parse_config filename = - ("quota-maxentity", Config.Set_int Quota.maxent); - ("quota-maxsize", Config.Set_int Quota.maxsize); - ("quota-maxrequests", Config.Set_int Define.maxrequests); -+ ("quota-maxoutstanding", Config.Set_int Define.maxoutstanding); -+ ("quota-maxwatchevents", Config.Set_int Define.maxwatchevents); - ("quota-path-max", Config.Set_int Define.path_max); - ("gc-max-overhead", Config.Set_int Define.gc_max_overhead); - ("test-eagain", Config.Set_bool Transaction.test_eagain); --- -2.37.4 - diff --git a/0069-SUPPORT.md-clarify-support-of-untrusted-driver-domai.patch b/0069-SUPPORT.md-clarify-support-of-untrusted-driver-domai.patch deleted file mode 100644 index 5660b02..0000000 --- a/0069-SUPPORT.md-clarify-support-of-untrusted-driver-domai.patch +++ /dev/null @@ -1,55 +0,0 @@ -From a026fddf89420dd25c5a9574d88aeab7c5711f6c Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Thu, 29 Sep 2022 13:07:35 +0200 -Subject: [PATCH 69/87] SUPPORT.md: clarify support of untrusted driver domains - with oxenstored - -Add a support statement for the scope of support regarding different -Xenstore variants. Especially oxenstored does not (yet) have security -support of untrusted driver domains, as those might drive oxenstored -out of memory by creating lots of watch events for the guests they are -servicing. - -Add a statement regarding Live Update support of oxenstored. 
- -This is part of XSA-326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: George Dunlap <george.dunlap@citrix.com> -Acked-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit c7bc20d8d123851a468402bbfc9e3330efff21ec) ---- - SUPPORT.md | 13 +++++++++---- - 1 file changed, 9 insertions(+), 4 deletions(-) - -diff --git a/SUPPORT.md b/SUPPORT.md -index 85726102eab8..7d0cb34c8f6f 100644 ---- a/SUPPORT.md -+++ b/SUPPORT.md -@@ -179,13 +179,18 @@ Support for running qemu-xen device model in a linux stubdomain. - - Status: Tech Preview - --## Liveupdate of C xenstored daemon -+## Xenstore - -- Status: Tech Preview -+### C xenstored daemon - --## Liveupdate of OCaml xenstored daemon -+ Status: Supported -+ Status, Liveupdate: Tech Preview - -- Status: Tech Preview -+### OCaml xenstored daemon -+ -+ Status: Supported -+ Status, untrusted driver domains: Supported, not security supported -+ Status, Liveupdate: Not functional - - ## Toolstack/3rd party - --- -2.37.4 - diff --git a/0070-tools-xenstore-don-t-use-conn-in-as-context-for-temp.patch b/0070-tools-xenstore-don-t-use-conn-in-as-context-for-temp.patch deleted file mode 100644 index 434ad0c..0000000 --- a/0070-tools-xenstore-don-t-use-conn-in-as-context-for-temp.patch +++ /dev/null @@ -1,718 +0,0 @@ -From c758765e464e166b5495c76466facc79584bbe1e Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 70/87] tools/xenstore: don't use conn->in as context for - temporary allocations - -Using the struct buffered data pointer of the current processed request -for temporary data allocations has a major drawback: the used area (and -with that the temporary data) is freed only after the response of the -request has been written to the ring page or has been read via the -socket. This can happen much later in case a guest isn't reading its -responses fast enough. 
- -As the temporary data can be safely freed after creating the response, -add a temporary context for that purpose and use that for allocating -the temporary memory, as it was already the case before commit -cc0612464896 ("xenstore: add small default data buffer to internal -struct"). - -Some sub-functions need to gain the "const" attribute for the talloc -context. - -This is XSA-416 / CVE-2022-42319. - -Fixes: cc0612464896 ("xenstore: add small default data buffer to internal struct") -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 2a587de219cc0765330fbf9fac6827bfaf29e29b) ---- - tools/xenstore/xenstored_control.c | 31 ++++++----- - tools/xenstore/xenstored_control.h | 3 +- - tools/xenstore/xenstored_core.c | 76 ++++++++++++++++---------- - tools/xenstore/xenstored_domain.c | 29 ++++++---- - tools/xenstore/xenstored_domain.h | 21 ++++--- - tools/xenstore/xenstored_transaction.c | 14 +++-- - tools/xenstore/xenstored_transaction.h | 6 +- - tools/xenstore/xenstored_watch.c | 9 +-- - tools/xenstore/xenstored_watch.h | 6 +- - 9 files changed, 118 insertions(+), 77 deletions(-) - -diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c -index 1031a81c3874..d0350c6ad861 100644 ---- a/tools/xenstore/xenstored_control.c -+++ b/tools/xenstore/xenstored_control.c -@@ -155,7 +155,7 @@ bool lu_is_pending(void) - - struct cmd_s { - char *cmd; -- int (*func)(void *, struct connection *, char **, int); -+ int (*func)(const void *, struct connection *, char **, int); - char *pars; - /* - * max_pars can be used to limit the size of the parameter vector, -@@ -167,7 +167,7 @@ struct cmd_s { - unsigned int max_pars; - }; - --static int do_control_check(void *ctx, struct connection *conn, -+static int do_control_check(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num) -@@ -179,7 +179,7 @@ static int do_control_check(void *ctx, struct connection *conn, - 
return 0; - } - --static int do_control_log(void *ctx, struct connection *conn, -+static int do_control_log(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num != 1) -@@ -281,7 +281,7 @@ static int quota_get(const void *ctx, struct connection *conn, - return domain_get_quota(ctx, conn, atoi(vec[0])); - } - --static int do_control_quota(void *ctx, struct connection *conn, -+static int do_control_quota(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num == 0) -@@ -293,7 +293,7 @@ static int do_control_quota(void *ctx, struct connection *conn, - return quota_get(ctx, conn, vec, num); - } - --static int do_control_quota_s(void *ctx, struct connection *conn, -+static int do_control_quota_s(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num == 0) -@@ -306,7 +306,7 @@ static int do_control_quota_s(void *ctx, struct connection *conn, - } - - #ifdef __MINIOS__ --static int do_control_memreport(void *ctx, struct connection *conn, -+static int do_control_memreport(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num) -@@ -318,7 +318,7 @@ static int do_control_memreport(void *ctx, struct connection *conn, - return 0; - } - #else --static int do_control_logfile(void *ctx, struct connection *conn, -+static int do_control_logfile(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num != 1) -@@ -333,7 +333,7 @@ static int do_control_logfile(void *ctx, struct connection *conn, - return 0; - } - --static int do_control_memreport(void *ctx, struct connection *conn, -+static int do_control_memreport(const void *ctx, struct connection *conn, - char **vec, int num) - { - FILE *fp; -@@ -373,7 +373,7 @@ static int do_control_memreport(void *ctx, struct connection *conn, - } - #endif - --static int do_control_print(void *ctx, struct connection *conn, -+static int do_control_print(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num != 
1) -@@ -875,7 +875,7 @@ static const char *lu_start(const void *ctx, struct connection *conn, - return NULL; - } - --static int do_control_lu(void *ctx, struct connection *conn, -+static int do_control_lu(const void *ctx, struct connection *conn, - char **vec, int num) - { - const char *ret = NULL; -@@ -922,7 +922,7 @@ static int do_control_lu(void *ctx, struct connection *conn, - } - #endif - --static int do_control_help(void *, struct connection *, char **, int); -+static int do_control_help(const void *, struct connection *, char **, int); - - static struct cmd_s cmds[] = { - { "check", do_control_check, "" }, -@@ -961,7 +961,7 @@ static struct cmd_s cmds[] = { - { "help", do_control_help, "" }, - }; - --static int do_control_help(void *ctx, struct connection *conn, -+static int do_control_help(const void *ctx, struct connection *conn, - char **vec, int num) - { - int cmd, len = 0; -@@ -997,7 +997,8 @@ static int do_control_help(void *ctx, struct connection *conn, - return 0; - } - --int do_control(struct connection *conn, struct buffered_data *in) -+int do_control(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - unsigned int cmd, num, off; - char **vec = NULL; -@@ -1017,11 +1018,11 @@ int do_control(struct connection *conn, struct buffered_data *in) - num = xs_count_strings(in->buffer, in->used); - if (cmds[cmd].max_pars) - num = min(num, cmds[cmd].max_pars); -- vec = talloc_array(in, char *, num); -+ vec = talloc_array(ctx, char *, num); - if (!vec) - return ENOMEM; - if (get_strings(in, vec, num) < num) - return EIO; - -- return cmds[cmd].func(in, conn, vec + 1, num - 1); -+ return cmds[cmd].func(ctx, conn, vec + 1, num - 1); - } -diff --git a/tools/xenstore/xenstored_control.h b/tools/xenstore/xenstored_control.h -index 98b6fbcea2b1..a8cb76559ba1 100644 ---- a/tools/xenstore/xenstored_control.h -+++ b/tools/xenstore/xenstored_control.h -@@ -16,7 +16,8 @@ - along with this program; If not, see <http://www.gnu.org/licenses/>. 
- */ - --int do_control(struct connection *conn, struct buffered_data *in); -+int do_control(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - void lu_read_state(void); - - struct connection *lu_get_connection(void); -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 16504de42017..411cc0e44714 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1248,11 +1248,13 @@ static struct node *get_node_canonicalized(struct connection *conn, - return get_node(conn, ctx, *canonical_name, perm); - } - --static int send_directory(struct connection *conn, struct buffered_data *in) -+static int send_directory(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - -- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1261,7 +1263,7 @@ static int send_directory(struct connection *conn, struct buffered_data *in) - return 0; - } - --static int send_directory_part(struct connection *conn, -+static int send_directory_part(const void *ctx, struct connection *conn, - struct buffered_data *in) - { - unsigned int off, len, maxlen, genlen; -@@ -1273,7 +1275,8 @@ static int send_directory_part(struct connection *conn, - return EINVAL; - - /* First arg is node name. 
*/ -- node = get_node_canonicalized(conn, in, in->buffer, NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, in->buffer, NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1300,7 +1303,7 @@ static int send_directory_part(struct connection *conn, - break; - } - -- data = talloc_array(in, char, genlen + len + 1); -+ data = talloc_array(ctx, char, genlen + len + 1); - if (!data) - return ENOMEM; - -@@ -1316,11 +1319,13 @@ static int send_directory_part(struct connection *conn, - return 0; - } - --static int do_read(struct connection *conn, struct buffered_data *in) -+static int do_read(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - -- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1510,7 +1515,8 @@ err: - } - - /* path, data... */ --static int do_write(struct connection *conn, struct buffered_data *in) -+static int do_write(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - unsigned int offset, datalen; - struct node *node; -@@ -1524,12 +1530,12 @@ static int do_write(struct connection *conn, struct buffered_data *in) - offset = strlen(vec[0]) + 1; - datalen = in->used - offset; - -- node = get_node_canonicalized(conn, in, vec[0], &name, XS_PERM_WRITE); -+ node = get_node_canonicalized(conn, ctx, vec[0], &name, XS_PERM_WRITE); - if (!node) { - /* No permissions, invalid input? 
*/ - if (errno != ENOENT) - return errno; -- node = create_node(conn, in, name, in->buffer + offset, -+ node = create_node(conn, ctx, name, in->buffer + offset, - datalen); - if (!node) - return errno; -@@ -1540,18 +1546,19 @@ static int do_write(struct connection *conn, struct buffered_data *in) - return errno; - } - -- fire_watches(conn, in, name, node, false, NULL); -+ fire_watches(conn, ctx, name, node, false, NULL); - send_ack(conn, XS_WRITE); - - return 0; - } - --static int do_mkdir(struct connection *conn, struct buffered_data *in) -+static int do_mkdir(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - char *name; - -- node = get_node_canonicalized(conn, in, onearg(in), &name, -+ node = get_node_canonicalized(conn, ctx, onearg(in), &name, - XS_PERM_WRITE); - - /* If it already exists, fine. */ -@@ -1561,10 +1568,10 @@ static int do_mkdir(struct connection *conn, struct buffered_data *in) - return errno; - if (!name) - return ENOMEM; -- node = create_node(conn, in, name, NULL, 0); -+ node = create_node(conn, ctx, name, NULL, 0); - if (!node) - return errno; -- fire_watches(conn, in, name, node, false, NULL); -+ fire_watches(conn, ctx, name, node, false, NULL); - } - send_ack(conn, XS_MKDIR); - -@@ -1662,24 +1669,25 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - } - - --static int do_rm(struct connection *conn, struct buffered_data *in) -+static int do_rm(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - int ret; - char *name; - char *parentname; - -- node = get_node_canonicalized(conn, in, onearg(in), &name, -+ node = get_node_canonicalized(conn, ctx, onearg(in), &name, - XS_PERM_WRITE); - if (!node) { - /* Didn't exist already? Fine, if parent exists. 
*/ - if (errno == ENOENT) { - if (!name) - return ENOMEM; -- parentname = get_parent(in, name); -+ parentname = get_parent(ctx, name); - if (!parentname) - return errno; -- node = read_node(conn, in, parentname); -+ node = read_node(conn, ctx, parentname); - if (node) { - send_ack(conn, XS_RM); - return 0; -@@ -1694,7 +1702,7 @@ static int do_rm(struct connection *conn, struct buffered_data *in) - if (streq(name, "/")) - return EINVAL; - -- ret = _rm(conn, in, node, name); -+ ret = _rm(conn, ctx, node, name); - if (ret) - return ret; - -@@ -1704,13 +1712,15 @@ static int do_rm(struct connection *conn, struct buffered_data *in) - } - - --static int do_get_perms(struct connection *conn, struct buffered_data *in) -+static int do_get_perms(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - char *strings; - unsigned int len; - -- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1723,7 +1733,8 @@ static int do_get_perms(struct connection *conn, struct buffered_data *in) - return 0; - } - --static int do_set_perms(struct connection *conn, struct buffered_data *in) -+static int do_set_perms(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node_perms perms, old_perms; - char *name, *permstr; -@@ -1740,7 +1751,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - - permstr = in->buffer + strlen(in->buffer) + 1; - -- perms.p = talloc_array(in, struct xs_permissions, perms.num); -+ perms.p = talloc_array(ctx, struct xs_permissions, perms.num); - if (!perms.p) - return ENOMEM; - if (!xs_strings_to_perms(perms.p, perms.num, permstr)) -@@ -1755,7 +1766,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - } - - /* We must own node to do this (tools can do this too). 
*/ -- node = get_node_canonicalized(conn, in, in->buffer, &name, -+ node = get_node_canonicalized(conn, ctx, in->buffer, &name, - XS_PERM_WRITE | XS_PERM_OWNER); - if (!node) - return errno; -@@ -1790,7 +1801,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - return errno; - } - -- fire_watches(conn, in, name, node, false, &old_perms); -+ fire_watches(conn, ctx, name, node, false, &old_perms); - send_ack(conn, XS_SET_PERMS); - - return 0; -@@ -1798,7 +1809,8 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - - static struct { - const char *str; -- int (*func)(struct connection *conn, struct buffered_data *in); -+ int (*func)(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - unsigned int flags; - #define XS_FLAG_NOTID (1U << 0) /* Ignore transaction id. */ - #define XS_FLAG_PRIV (1U << 1) /* Privileged domain only. */ -@@ -1874,6 +1886,7 @@ static void process_message(struct connection *conn, struct buffered_data *in) - struct transaction *trans; - enum xsd_sockmsg_type type = in->hdr.msg.type; - int ret; -+ void *ctx; - - /* At least send_error() and send_reply() expects conn->in == in */ - assert(conn->in == in); -@@ -1898,10 +1911,17 @@ static void process_message(struct connection *conn, struct buffered_data *in) - return; - } - -+ ctx = talloc_new(NULL); -+ if (!ctx) { -+ send_error(conn, ENOMEM); -+ return; -+ } -+ - assert(conn->transaction == NULL); - conn->transaction = trans; - -- ret = wire_funcs[type].func(conn, in); -+ ret = wire_funcs[type].func(ctx, conn, in); -+ talloc_free(ctx); - if (ret) - send_error(conn, ret); - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index e7c6886ccf47..fb732d0a14c3 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -330,7 +330,7 @@ bool domain_is_unprivileged(struct connection *conn) - domid_is_unprivileged(conn->domain->domid); - } - --static char 
*talloc_domain_path(void *context, unsigned int domid) -+static char *talloc_domain_path(const void *context, unsigned int domid) - { - return talloc_asprintf(context, "/local/domain/%u", domid); - } -@@ -534,7 +534,8 @@ static struct domain *introduce_domain(const void *ctx, - } - - /* domid, gfn, evtchn, path */ --int do_introduce(struct connection *conn, struct buffered_data *in) -+int do_introduce(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct domain *domain; - char *vec[3]; -@@ -552,7 +553,7 @@ int do_introduce(struct connection *conn, struct buffered_data *in) - if (port <= 0) - return EINVAL; - -- domain = introduce_domain(in, domid, port, false); -+ domain = introduce_domain(ctx, domid, port, false); - if (!domain) - return errno; - -@@ -575,7 +576,8 @@ static struct domain *find_connected_domain(unsigned int domid) - return domain; - } - --int do_set_target(struct connection *conn, struct buffered_data *in) -+int do_set_target(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - char *vec[2]; - unsigned int domid, tdomid; -@@ -619,7 +621,8 @@ static struct domain *onearg_domain(struct connection *conn, - } - - /* domid */ --int do_release(struct connection *conn, struct buffered_data *in) -+int do_release(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct domain *domain; - -@@ -634,7 +637,8 @@ int do_release(struct connection *conn, struct buffered_data *in) - return 0; - } - --int do_resume(struct connection *conn, struct buffered_data *in) -+int do_resume(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct domain *domain; - -@@ -649,7 +653,8 @@ int do_resume(struct connection *conn, struct buffered_data *in) - return 0; - } - --int do_get_domain_path(struct connection *conn, struct buffered_data *in) -+int do_get_domain_path(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - char *path; - const char 
*domid_str = onearg(in); -@@ -657,18 +662,17 @@ int do_get_domain_path(struct connection *conn, struct buffered_data *in) - if (!domid_str) - return EINVAL; - -- path = talloc_domain_path(conn, atoi(domid_str)); -+ path = talloc_domain_path(ctx, atoi(domid_str)); - if (!path) - return errno; - - send_reply(conn, XS_GET_DOMAIN_PATH, path, strlen(path) + 1); - -- talloc_free(path); -- - return 0; - } - --int do_is_domain_introduced(struct connection *conn, struct buffered_data *in) -+int do_is_domain_introduced(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - int result; - unsigned int domid; -@@ -689,7 +693,8 @@ int do_is_domain_introduced(struct connection *conn, struct buffered_data *in) - } - - /* Allow guest to reset all watches */ --int do_reset_watches(struct connection *conn, struct buffered_data *in) -+int do_reset_watches(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - conn_delete_all_watches(conn); - conn_delete_all_transactions(conn); -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 904faa923afb..b9e152890149 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -24,25 +24,32 @@ void handle_event(void); - void check_domains(void); - - /* domid, mfn, eventchn, path */ --int do_introduce(struct connection *conn, struct buffered_data *in); -+int do_introduce(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_is_domain_introduced(struct connection *conn, struct buffered_data *in); -+int do_is_domain_introduced(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_release(struct connection *conn, struct buffered_data *in); -+int do_release(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_resume(struct connection *conn, struct buffered_data *in); -+int do_resume(const void *ctx, struct 
connection *conn, -+ struct buffered_data *in); - - /* domid, target */ --int do_set_target(struct connection *conn, struct buffered_data *in); -+int do_set_target(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_get_domain_path(struct connection *conn, struct buffered_data *in); -+int do_get_domain_path(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* Allow guest to reset all watches */ --int do_reset_watches(struct connection *conn, struct buffered_data *in); -+int do_reset_watches(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - void domain_init(int evtfd); - void dom0_init(void); -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 28774813de83..3e3eb47326cc 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -481,7 +481,8 @@ struct transaction *transaction_lookup(struct connection *conn, uint32_t id) - return ERR_PTR(-ENOENT); - } - --int do_transaction_start(struct connection *conn, struct buffered_data *in) -+int do_transaction_start(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct transaction *trans, *exists; - char id_str[20]; -@@ -494,8 +495,8 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) - conn->transaction_started > quota_max_transaction) - return ENOSPC; - -- /* Attach transaction to input for autofree until it's complete */ -- trans = talloc_zero(in, struct transaction); -+ /* Attach transaction to ctx for autofree until it's complete */ -+ trans = talloc_zero(ctx, struct transaction); - if (!trans) - return ENOMEM; - -@@ -544,7 +545,8 @@ static int transaction_fix_domains(struct transaction *trans, bool update) - return 0; - } - --int do_transaction_end(struct connection *conn, struct buffered_data *in) -+int do_transaction_end(const void *ctx, struct connection *conn, -+ struct 
buffered_data *in) - { - const char *arg = onearg(in); - struct transaction *trans; -@@ -562,8 +564,8 @@ int do_transaction_end(struct connection *conn, struct buffered_data *in) - if (!conn->transaction_started) - conn->ta_start_time = 0; - -- /* Attach transaction to in for auto-cleanup */ -- talloc_steal(in, trans); -+ /* Attach transaction to ctx for auto-cleanup */ -+ talloc_steal(ctx, trans); - - if (streq(arg, "T")) { - if (trans->fail) -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index e3cbd6b23095..39d7f81c5127 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -29,8 +29,10 @@ struct transaction; - - extern uint64_t generation; - --int do_transaction_start(struct connection *conn, struct buffered_data *node); --int do_transaction_end(struct connection *conn, struct buffered_data *in); -+int do_transaction_start(const void *ctx, struct connection *conn, -+ struct buffered_data *node); -+int do_transaction_end(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - struct transaction *transaction_lookup(struct connection *conn, uint32_t id); - -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 85362bcce314..316c08b7f754 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -243,7 +243,7 @@ static struct watch *add_watch(struct connection *conn, char *path, char *token, - return NULL; - } - --int do_watch(struct connection *conn, struct buffered_data *in) -+int do_watch(const void *ctx, struct connection *conn, struct buffered_data *in) - { - struct watch *watch; - char *vec[2]; -@@ -252,7 +252,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) - if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec)) - return EINVAL; - -- errno = check_watch_path(conn, in, &(vec[0]), &relative); -+ errno = check_watch_path(conn, ctx, &(vec[0]), &relative); - 
if (errno) - return errno; - -@@ -283,7 +283,8 @@ int do_watch(struct connection *conn, struct buffered_data *in) - return 0; - } - --int do_unwatch(struct connection *conn, struct buffered_data *in) -+int do_unwatch(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct watch *watch; - char *node, *vec[2]; -@@ -291,7 +292,7 @@ int do_unwatch(struct connection *conn, struct buffered_data *in) - if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec)) - return EINVAL; - -- node = canonicalize(conn, in, vec[0]); -+ node = canonicalize(conn, ctx, vec[0]); - if (!node) - return ENOMEM; - list_for_each_entry(watch, &conn->watches, list) { -diff --git a/tools/xenstore/xenstored_watch.h b/tools/xenstore/xenstored_watch.h -index 0e693f0839cd..091890edca96 100644 ---- a/tools/xenstore/xenstored_watch.h -+++ b/tools/xenstore/xenstored_watch.h -@@ -21,8 +21,10 @@ - - #include "xenstored_core.h" - --int do_watch(struct connection *conn, struct buffered_data *in); --int do_unwatch(struct connection *conn, struct buffered_data *in); -+int do_watch(const void *ctx, struct connection *conn, -+ struct buffered_data *in); -+int do_unwatch(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* Fire all watches: !exact means all the children are affected (ie. rm). 
*/ - void fire_watches(struct connection *conn, const void *tmp, const char *name, --- -2.37.4 - diff --git a/0071-tools-xenstore-fix-checking-node-permissions.patch b/0071-tools-xenstore-fix-checking-node-permissions.patch deleted file mode 100644 index 7cfb08b..0000000 --- a/0071-tools-xenstore-fix-checking-node-permissions.patch +++ /dev/null @@ -1,143 +0,0 @@ -From 036fa8717b316a10b67ea8cf4d5dd200ac2b29af Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: [PATCH 71/87] tools/xenstore: fix checking node permissions - -Today chk_domain_generation() is being used to check whether a node -permission entry is still valid or whether it is referring to a domain -no longer existing. This is done by comparing the node's and the -domain's generation count. - -In case no struct domain is existing for a checked domain, but the -domain itself is valid, chk_domain_generation() assumes it is being -called due to the first node created for a new domain and it will -return success. - -This might be wrong in case the checked permission is related to an -old domain, which has just been replaced with a new domain using the -same domid. - -Fix that by letting chk_domain_generation() fail in case a struct -domain isn't found. In order to cover the case of the first node for -a new domain try to allocate the needed struct domain explicitly when -processing the related SET_PERMS command. In case a referenced domain -isn't existing, flag the related permission to be ignored right away. - -This is XSA-417 / CVE-2022-42320. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit ab128218225d3542596ca3a02aee80d55494bef8) ---- - tools/xenstore/xenstored_core.c | 5 +++++ - tools/xenstore/xenstored_domain.c | 37 +++++++++++++++++++++---------- - tools/xenstore/xenstored_domain.h | 1 + - 3 files changed, 31 insertions(+), 12 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 411cc0e44714..c676ee4e4e4f 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1757,6 +1757,11 @@ static int do_set_perms(const void *ctx, struct connection *conn, - if (!xs_strings_to_perms(perms.p, perms.num, permstr)) - return errno; - -+ if (domain_alloc_permrefs(&perms) < 0) -+ return ENOMEM; -+ if (perms.p[0].perms & XS_PERM_IGNORE) -+ return ENOENT; -+ - /* First arg is node name. */ - if (strstarts(in->buffer, "@")) { - if (set_perms_special(conn, in->buffer, &perms)) -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index fb732d0a14c3..e2f1b09c6037 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -875,7 +875,6 @@ int domain_entry_inc(struct connection *conn, struct node *node) - * count (used for testing whether a node permission is older than a domain). - * - * Return values: -- * -1: error - * 0: domain has higher generation count (it is younger than a node with the - * given count), or domain isn't existing any longer - * 1: domain is older than the node -@@ -883,20 +882,38 @@ int domain_entry_inc(struct connection *conn, struct node *node) - static int chk_domain_generation(unsigned int domid, uint64_t gen) - { - struct domain *d; -- xc_dominfo_t dominfo; - - if (!xc_handle && domid == 0) - return 1; - - d = find_domain_struct(domid); -- if (d) -- return (d->generation <= gen) ? 
1 : 0; - -- if (!get_domain_info(domid, &dominfo)) -- return 0; -+ return (d && d->generation <= gen) ? 1 : 0; -+} - -- d = alloc_domain(NULL, domid); -- return d ? 1 : -1; -+/* -+ * Allocate all missing struct domain referenced by a permission set. -+ * Any permission entries for not existing domains will be marked to be -+ * ignored. -+ */ -+int domain_alloc_permrefs(struct node_perms *perms) -+{ -+ unsigned int i, domid; -+ struct domain *d; -+ xc_dominfo_t dominfo; -+ -+ for (i = 0; i < perms->num; i++) { -+ domid = perms->p[i].id; -+ d = find_domain_struct(domid); -+ if (!d) { -+ if (!get_domain_info(domid, &dominfo)) -+ perms->p[i].perms |= XS_PERM_IGNORE; -+ else if (!alloc_domain(NULL, domid)) -+ return ENOMEM; -+ } -+ } -+ -+ return 0; - } - - /* -@@ -909,8 +926,6 @@ int domain_adjust_node_perms(struct connection *conn, struct node *node) - int ret; - - ret = chk_domain_generation(node->perms.p[0].id, node->generation); -- if (ret < 0) -- return errno; - - /* If the owner doesn't exist any longer give it to priv domain. */ - if (!ret) { -@@ -927,8 +942,6 @@ int domain_adjust_node_perms(struct connection *conn, struct node *node) - continue; - ret = chk_domain_generation(node->perms.p[i].id, - node->generation); -- if (ret < 0) -- return errno; - if (!ret) - node->perms.p[i].perms |= XS_PERM_IGNORE; - } -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index b9e152890149..40fe5f690900 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -62,6 +62,7 @@ bool domain_is_unprivileged(struct connection *conn); - - /* Remove node permissions for no longer existing domains. 
*/ - int domain_adjust_node_perms(struct connection *conn, struct node *node); -+int domain_alloc_permrefs(struct node_perms *perms); - - /* Quota manipulation */ - int domain_entry_inc(struct connection *conn, struct node *); --- -2.37.4 - diff --git a/0072-tools-xenstore-remove-recursion-from-construct_node.patch b/0072-tools-xenstore-remove-recursion-from-construct_node.patch deleted file mode 100644 index 72aebfd..0000000 --- a/0072-tools-xenstore-remove-recursion-from-construct_node.patch +++ /dev/null @@ -1,125 +0,0 @@ -From 074b32e47174a30bb751f2e2c07628eb56117eb8 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:11 +0200 -Subject: [PATCH 72/87] tools/xenstore: remove recursion from construct_node() - -In order to reduce stack usage due to recursion, switch -construct_node() to use a loop instead. - -This is part of XSA-418 / CVE-2022-42321. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit da8ee25d02a5447ba39a9800ee2a710ae1f54222) ---- - tools/xenstore/xenstored_core.c | 86 +++++++++++++++++++++------------ - 1 file changed, 55 insertions(+), 31 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index c676ee4e4e4f..3907c35643e9 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1377,45 +1377,69 @@ static int add_child(const void *ctx, struct node *parent, const char *name) - static struct node *construct_node(struct connection *conn, const void *ctx, - const char *name) - { -- struct node *parent, *node; -- char *parentname = get_parent(ctx, name); -+ const char **names = NULL; -+ unsigned int levels = 0; -+ struct node *node = NULL; -+ struct node *parent = NULL; -+ const char *parentname = talloc_strdup(ctx, name); - - if (!parentname) - return NULL; - -- /* If parent doesn't exist, create it. 
*/ -- parent = read_node(conn, parentname, parentname); -- if (!parent && errno == ENOENT) -- parent = construct_node(conn, ctx, parentname); -- if (!parent) -- return NULL; -+ /* Walk the path up until an existing node is found. */ -+ while (!parent) { -+ names = talloc_realloc(ctx, names, const char *, levels + 1); -+ if (!names) -+ goto nomem; - -- /* Add child to parent. */ -- if (add_child(ctx, parent, name)) -- goto nomem; -+ /* -+ * names[0] is the name of the node to construct initially, -+ * names[1] is its parent, and so on. -+ */ -+ names[levels] = parentname; -+ parentname = get_parent(ctx, parentname); -+ if (!parentname) -+ return NULL; - -- /* Allocate node */ -- node = talloc(ctx, struct node); -- if (!node) -- goto nomem; -- node->name = talloc_strdup(node, name); -- if (!node->name) -- goto nomem; -+ /* Try to read parent node until we found an existing one. */ -+ parent = read_node(conn, ctx, parentname); -+ if (!parent && (errno != ENOENT || !strcmp(parentname, "/"))) -+ return NULL; - -- /* Inherit permissions, except unprivileged domains own what they create */ -- node->perms.num = parent->perms.num; -- node->perms.p = talloc_memdup(node, parent->perms.p, -- node->perms.num * sizeof(*node->perms.p)); -- if (!node->perms.p) -- goto nomem; -- if (domain_is_unprivileged(conn)) -- node->perms.p[0].id = conn->id; -+ levels++; -+ } -+ -+ /* Walk the path down again constructing the missing nodes. */ -+ for (; levels > 0; levels--) { -+ /* Add child to parent. */ -+ if (add_child(ctx, parent, names[levels - 1])) -+ goto nomem; -+ -+ /* Allocate node */ -+ node = talloc(ctx, struct node); -+ if (!node) -+ goto nomem; -+ node->name = talloc_steal(node, names[levels - 1]); -+ -+ /* Inherit permissions, unpriv domains own what they create. 
*/ -+ node->perms.num = parent->perms.num; -+ node->perms.p = talloc_memdup(node, parent->perms.p, -+ node->perms.num * -+ sizeof(*node->perms.p)); -+ if (!node->perms.p) -+ goto nomem; -+ if (domain_is_unprivileged(conn)) -+ node->perms.p[0].id = conn->id; -+ -+ /* No children, no data */ -+ node->children = node->data = NULL; -+ node->childlen = node->datalen = 0; -+ node->acc.memory = 0; -+ node->parent = parent; -+ -+ parent = node; -+ } - -- /* No children, no data */ -- node->children = node->data = NULL; -- node->childlen = node->datalen = 0; -- node->acc.memory = 0; -- node->parent = parent; - return node; - - nomem: --- -2.37.4 - diff --git a/0073-tools-xenstore-don-t-let-remove_child_entry-call-cor.patch b/0073-tools-xenstore-don-t-let-remove_child_entry-call-cor.patch deleted file mode 100644 index 3c01eb5..0000000 --- a/0073-tools-xenstore-don-t-let-remove_child_entry-call-cor.patch +++ /dev/null @@ -1,110 +0,0 @@ -From 32ff913afed898e6aef61626a58dc0bf5c6309ef Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:11 +0200 -Subject: [PATCH 73/87] tools/xenstore: don't let remove_child_entry() call - corrupt() - -In case of write_node() returning an error, remove_child_entry() will -call corrupt() today. This could result in an endless recursion, as -remove_child_entry() is called by corrupt(), too: - -corrupt() - check_store() - check_store_() - remove_child_entry() - -Fix that by letting remove_child_entry() return an error instead and -let the caller decide what to do. - -This is part of XSA-418 / CVE-2022-42321. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 0c00c51f3bc8206c7f9cf87d014650157bee2bf4) ---- - tools/xenstore/xenstored_core.c | 36 ++++++++++++++++++--------------- - 1 file changed, 20 insertions(+), 16 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 3907c35643e9..f433a45dc217 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1608,15 +1608,15 @@ static void memdel(void *mem, unsigned off, unsigned len, unsigned total) - memmove(mem + off, mem + off + len, total - off - len); - } - --static void remove_child_entry(struct connection *conn, struct node *node, -- size_t offset) -+static int remove_child_entry(struct connection *conn, struct node *node, -+ size_t offset) - { - size_t childlen = strlen(node->children + offset); - - memdel(node->children, offset, childlen + 1, node->childlen); - node->childlen -= childlen + 1; -- if (write_node(conn, node, true)) -- corrupt(conn, "Can't update parent node '%s'", node->name); -+ -+ return write_node(conn, node, true); - } - - static void delete_child(struct connection *conn, -@@ -1626,7 +1626,9 @@ static void delete_child(struct connection *conn, - - for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) { - if (streq(node->children+i, childname)) { -- remove_child_entry(conn, node, i); -+ if (remove_child_entry(conn, node, i)) -+ corrupt(conn, "Can't update parent node '%s'", -+ node->name); - return; - } - } -@@ -2325,6 +2327,17 @@ int remember_string(struct hashtable *hash, const char *str) - return hashtable_insert(hash, k, (void *)1); - } - -+static int rm_child_entry(struct node *node, size_t off, size_t len) -+{ -+ if (!recovery) -+ return off; -+ -+ if (remove_child_entry(NULL, node, off)) -+ log("check_store: child entry could not be removed from '%s'", -+ node->name); -+ -+ return off - len - 1; -+} - - /** - * A node has a 
children field that names the children of the node, separated -@@ -2377,12 +2390,7 @@ static int check_store_(const char *name, struct hashtable *reachable) - if (hashtable_search(children, childname)) { - log("check_store: '%s' is duplicated!", - childname); -- -- if (recovery) { -- remove_child_entry(NULL, node, -- i); -- i -= childlen + 1; -- } -+ i = rm_child_entry(node, i, childlen); - } - else { - if (!remember_string(children, -@@ -2399,11 +2407,7 @@ static int check_store_(const char *name, struct hashtable *reachable) - } else if (errno != ENOMEM) { - log("check_store: No child '%s' found!\n", - childname); -- -- if (recovery) { -- remove_child_entry(NULL, node, i); -- i -= childlen + 1; -- } -+ i = rm_child_entry(node, i, childlen); - } else { - log("check_store: ENOMEM"); - ret = ENOMEM; --- -2.37.4 - diff --git a/0074-tools-xenstore-add-generic-treewalk-function.patch b/0074-tools-xenstore-add-generic-treewalk-function.patch deleted file mode 100644 index d84439c..0000000 --- a/0074-tools-xenstore-add-generic-treewalk-function.patch +++ /dev/null @@ -1,250 +0,0 @@ -From 01ab4910229696e51c59a80eb86d0fedeeccb54b Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:11 +0200 -Subject: [PATCH 74/87] tools/xenstore: add generic treewalk function - -Add a generic function to walk the complete node tree. It will start -at "/" and descend recursively into each child, calling a function -specified by the caller. Depending on the return value of the user -specified function the walk will be aborted, continued, or the current -child will be skipped by not descending into its children. - -This is part of XSA-418 / CVE-2022-42321. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 0d7c5d19bc27492360196e7dad2b227908564fff) ---- - tools/xenstore/xenstored_core.c | 143 +++++++++++++++++++++++++++++--- - tools/xenstore/xenstored_core.h | 40 +++++++++ - 2 files changed, 170 insertions(+), 13 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index f433a45dc217..2cda3ee375ab 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1838,6 +1838,135 @@ static int do_set_perms(const void *ctx, struct connection *conn, - return 0; - } - -+static char *child_name(const void *ctx, const char *s1, const char *s2) -+{ -+ if (strcmp(s1, "/")) -+ return talloc_asprintf(ctx, "%s/%s", s1, s2); -+ return talloc_asprintf(ctx, "/%s", s2); -+} -+ -+static int rm_from_parent(struct connection *conn, struct node *parent, -+ const char *name) -+{ -+ size_t off; -+ -+ if (!parent) -+ return WALK_TREE_ERROR_STOP; -+ -+ for (off = parent->childoff - 1; off && parent->children[off - 1]; -+ off--); -+ if (remove_child_entry(conn, parent, off)) { -+ log("treewalk: child entry could not be removed from '%s'", -+ parent->name); -+ return WALK_TREE_ERROR_STOP; -+ } -+ parent->childoff = off; -+ -+ return WALK_TREE_OK; -+} -+ -+static int walk_call_func(const void *ctx, struct connection *conn, -+ struct node *node, struct node *parent, void *arg, -+ int (*func)(const void *ctx, struct connection *conn, -+ struct node *node, void *arg)) -+{ -+ int ret; -+ -+ if (!func) -+ return WALK_TREE_OK; -+ -+ ret = func(ctx, conn, node, arg); -+ if (ret == WALK_TREE_RM_CHILDENTRY && parent) -+ ret = rm_from_parent(conn, parent, node->name); -+ -+ return ret; -+} -+ -+int walk_node_tree(const void *ctx, struct connection *conn, const char *root, -+ struct walk_funcs *funcs, void *arg) -+{ -+ int ret = 0; -+ void *tmpctx; -+ char *name; -+ struct node *node = NULL; -+ struct node *parent = 
NULL; -+ -+ tmpctx = talloc_new(ctx); -+ if (!tmpctx) { -+ errno = ENOMEM; -+ return WALK_TREE_ERROR_STOP; -+ } -+ name = talloc_strdup(tmpctx, root); -+ if (!name) { -+ errno = ENOMEM; -+ talloc_free(tmpctx); -+ return WALK_TREE_ERROR_STOP; -+ } -+ -+ /* Continue the walk until an error is returned. */ -+ while (ret >= 0) { -+ /* node == NULL possible only for the initial loop iteration. */ -+ if (node) { -+ /* Go one step up if ret or if last child finished. */ -+ if (ret || node->childoff >= node->childlen) { -+ parent = node->parent; -+ /* Call function AFTER processing a node. */ -+ ret = walk_call_func(ctx, conn, node, parent, -+ arg, funcs->exit); -+ /* Last node, so exit loop. */ -+ if (!parent) -+ break; -+ talloc_free(node); -+ /* Continue with parent. */ -+ node = parent; -+ continue; -+ } -+ /* Get next child of current node. */ -+ name = child_name(tmpctx, node->name, -+ node->children + node->childoff); -+ if (!name) { -+ ret = WALK_TREE_ERROR_STOP; -+ break; -+ } -+ /* Point to next child. */ -+ node->childoff += strlen(node->children + -+ node->childoff) + 1; -+ /* Descent into children. */ -+ parent = node; -+ } -+ /* Read next node (root node or next child). */ -+ node = read_node(conn, tmpctx, name); -+ if (!node) { -+ /* Child not found - should not happen! */ -+ /* ENOENT case can be handled by supplied function. */ -+ if (errno == ENOENT && funcs->enoent) -+ ret = funcs->enoent(ctx, conn, parent, name, -+ arg); -+ else -+ ret = WALK_TREE_ERROR_STOP; -+ if (!parent) -+ break; -+ if (ret == WALK_TREE_RM_CHILDENTRY) -+ ret = rm_from_parent(conn, parent, name); -+ if (ret < 0) -+ break; -+ talloc_free(name); -+ node = parent; -+ continue; -+ } -+ talloc_free(name); -+ node->parent = parent; -+ node->childoff = 0; -+ /* Call function BEFORE processing a node. */ -+ ret = walk_call_func(ctx, conn, node, parent, arg, -+ funcs->enter); -+ } -+ -+ talloc_free(tmpctx); -+ -+ return ret < 0 ? 
ret : WALK_TREE_OK; -+} -+ - static struct { - const char *str; - int (*func)(const void *ctx, struct connection *conn, -@@ -2305,18 +2434,6 @@ static int keys_equal_fn(void *key1, void *key2) - return 0 == strcmp((char *)key1, (char *)key2); - } - -- --static char *child_name(const char *s1, const char *s2) --{ -- if (strcmp(s1, "/")) { -- return talloc_asprintf(NULL, "%s/%s", s1, s2); -- } -- else { -- return talloc_asprintf(NULL, "/%s", s2); -- } --} -- -- - int remember_string(struct hashtable *hash, const char *str) - { - char *k = malloc(strlen(str) + 1); -@@ -2376,7 +2493,7 @@ static int check_store_(const char *name, struct hashtable *reachable) - while (i < node->childlen && !ret) { - struct node *childnode; - size_t childlen = strlen(node->children + i); -- char * childname = child_name(node->name, -+ char * childname = child_name(NULL, node->name, - node->children + i); - - if (!childname) { -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index bfd3fc1e9df3..2d9942171d92 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -202,6 +202,7 @@ struct node { - - /* Children, each nul-terminated. */ - unsigned int childlen; -+ unsigned int childoff; /* Used by walk_node_tree() internally. */ - char *children; - - /* Allocation information for node currently in store. */ -@@ -338,6 +339,45 @@ void read_state_buffered_data(const void *ctx, struct connection *conn, - const struct xs_state_connection *sc); - void read_state_node(const void *ctx, const void *state); - -+/* -+ * Walk the node tree below root calling funcs->enter() and funcs->exit() for -+ * each node. funcs->enter() is being called when entering a node, so before -+ * any of the children of the node is processed. funcs->exit() is being -+ * called when leaving the node, so after all children have been processed. -+ * funcs->enoent() is being called when a node isn't existing. 
-+ * funcs->*() return values: -+ * < 0: tree walk is stopped, walk_node_tree() returns funcs->*() return value -+ * in case WALK_TREE_ERROR_STOP is returned, errno should be set -+ * WALK_TREE_OK: tree walk is continuing -+ * WALK_TREE_SKIP_CHILDREN: tree walk won't descend below current node, but -+ * walk continues -+ * WALK_TREE_RM_CHILDENTRY: Remove the child entry from its parent and write -+ * the modified parent node back to the data base, implies to not descend -+ * below the current node, but to continue the walk -+ * funcs->*() is allowed to modify the node it is called for in the data base. -+ * In case funcs->enter() is deleting the node, it must not return WALK_TREE_OK -+ * in order to avoid descending into no longer existing children. -+ */ -+/* Return values for funcs->*() and walk_node_tree(). */ -+#define WALK_TREE_SUCCESS_STOP -100 /* Stop walk early, no error. */ -+#define WALK_TREE_ERROR_STOP -1 /* Stop walk due to error. */ -+#define WALK_TREE_OK 0 /* No error. */ -+/* Return value for funcs->*() only. */ -+#define WALK_TREE_SKIP_CHILDREN 1 /* Don't recurse below current node. */ -+#define WALK_TREE_RM_CHILDENTRY 2 /* Remove child entry from parent. 
*/ -+ -+struct walk_funcs { -+ int (*enter)(const void *ctx, struct connection *conn, -+ struct node *node, void *arg); -+ int (*exit)(const void *ctx, struct connection *conn, -+ struct node *node, void *arg); -+ int (*enoent)(const void *ctx, struct connection *conn, -+ struct node *parent, char *name, void *arg); -+}; -+ -+int walk_node_tree(const void *ctx, struct connection *conn, const char *root, -+ struct walk_funcs *funcs, void *arg); -+ - #endif /* _XENSTORED_CORE_H */ - - /* --- -2.37.4 - diff --git a/0075-tools-xenstore-simplify-check_store.patch b/0075-tools-xenstore-simplify-check_store.patch deleted file mode 100644 index 5d0348f..0000000 --- a/0075-tools-xenstore-simplify-check_store.patch +++ /dev/null @@ -1,114 +0,0 @@ -From c5a76df793c638423e1388528dc679a3e020a477 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 75/87] tools/xenstore: simplify check_store() - -check_store() is using a hash table for storing all node names it has -found via walking the tree. Additionally it using another hash table -for all children of a node to detect duplicate child names. - -Simplify that by dropping the second hash table as the first one is -already holding all the needed information. - -This is part of XSA-418 / CVE-2022-42321. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 70f719f52a220bc5bc987e4dd28e14a7039a176b) ---- - tools/xenstore/xenstored_core.c | 47 +++++++++++---------------------- - 1 file changed, 15 insertions(+), 32 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 2cda3ee375ab..760f3c16c794 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2477,50 +2477,34 @@ static int check_store_(const char *name, struct hashtable *reachable) - if (node) { - size_t i = 0; - -- struct hashtable * children = -- create_hashtable(16, hash_from_key_fn, keys_equal_fn); -- if (!children) { -- log("check_store create table: ENOMEM"); -- return ENOMEM; -- } -- - if (!remember_string(reachable, name)) { -- hashtable_destroy(children, 0); - log("check_store: ENOMEM"); - return ENOMEM; - } - - while (i < node->childlen && !ret) { -- struct node *childnode; -+ struct node *childnode = NULL; - size_t childlen = strlen(node->children + i); -- char * childname = child_name(NULL, node->name, -- node->children + i); -+ char *childname = child_name(NULL, node->name, -+ node->children + i); - - if (!childname) { - log("check_store: ENOMEM"); - ret = ENOMEM; - break; - } -+ -+ if (hashtable_search(reachable, childname)) { -+ log("check_store: '%s' is duplicated!", -+ childname); -+ i = rm_child_entry(node, i, childlen); -+ goto next; -+ } -+ - childnode = read_node(NULL, childname, childname); -- -+ - if (childnode) { -- if (hashtable_search(children, childname)) { -- log("check_store: '%s' is duplicated!", -- childname); -- i = rm_child_entry(node, i, childlen); -- } -- else { -- if (!remember_string(children, -- childname)) { -- log("check_store: ENOMEM"); -- talloc_free(childnode); -- talloc_free(childname); -- ret = ENOMEM; -- break; -- } -- ret = check_store_(childname, -- reachable); -- } -+ ret = check_store_(childname, reachable); - 
} else if (errno != ENOMEM) { - log("check_store: No child '%s' found!\n", - childname); -@@ -2530,19 +2514,18 @@ static int check_store_(const char *name, struct hashtable *reachable) - ret = ENOMEM; - } - -+ next: - talloc_free(childnode); - talloc_free(childname); - i += childlen + 1; - } - -- hashtable_destroy(children, 0 /* Don't free values (they are -- all (void *)1) */); - talloc_free(node); - } else if (errno != ENOMEM) { - /* Impossible, because no database should ever be without the - root, and otherwise, we've just checked in our caller - (which made a recursive call to get here). */ -- -+ - log("check_store: No child '%s' found: impossible!", name); - } else { - log("check_store: ENOMEM"); --- -2.37.4 - diff --git a/0076-tools-xenstore-use-treewalk-for-check_store.patch b/0076-tools-xenstore-use-treewalk-for-check_store.patch deleted file mode 100644 index b965eb0..0000000 --- a/0076-tools-xenstore-use-treewalk-for-check_store.patch +++ /dev/null @@ -1,172 +0,0 @@ -From f5a4c26b2efc55a5267840fcb31f95c00cc25d10 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 76/87] tools/xenstore: use treewalk for check_store() - -Instead of doing an open tree walk using call recursion, use -walk_node_tree() when checking the store for inconsistencies. - -This will reduce code size and avoid many nesting levels of function -calls which could potentially exhaust the stack. - -This is part of XSA-418 / CVE-2022-42321. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit a07cc0ec60612f414bedf2bafb26ec38d2602e95) ---- - tools/xenstore/xenstored_core.c | 109 +++++++++----------------------- - 1 file changed, 30 insertions(+), 79 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 760f3c16c794..efdd1888fd78 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2444,18 +2444,6 @@ int remember_string(struct hashtable *hash, const char *str) - return hashtable_insert(hash, k, (void *)1); - } - --static int rm_child_entry(struct node *node, size_t off, size_t len) --{ -- if (!recovery) -- return off; -- -- if (remove_child_entry(NULL, node, off)) -- log("check_store: child entry could not be removed from '%s'", -- node->name); -- -- return off - len - 1; --} -- - /** - * A node has a children field that names the children of the node, separated - * by NULs. We check whether there are entries in there that are duplicated -@@ -2469,70 +2457,29 @@ static int rm_child_entry(struct node *node, size_t off, size_t len) - * As we go, we record each node in the given reachable hashtable. These - * entries will be used later in clean_store. 
- */ --static int check_store_(const char *name, struct hashtable *reachable) -+static int check_store_step(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) - { -- struct node *node = read_node(NULL, name, name); -- int ret = 0; -+ struct hashtable *reachable = arg; - -- if (node) { -- size_t i = 0; -- -- if (!remember_string(reachable, name)) { -- log("check_store: ENOMEM"); -- return ENOMEM; -- } -- -- while (i < node->childlen && !ret) { -- struct node *childnode = NULL; -- size_t childlen = strlen(node->children + i); -- char *childname = child_name(NULL, node->name, -- node->children + i); -- -- if (!childname) { -- log("check_store: ENOMEM"); -- ret = ENOMEM; -- break; -- } -- -- if (hashtable_search(reachable, childname)) { -- log("check_store: '%s' is duplicated!", -- childname); -- i = rm_child_entry(node, i, childlen); -- goto next; -- } -- -- childnode = read_node(NULL, childname, childname); -- -- if (childnode) { -- ret = check_store_(childname, reachable); -- } else if (errno != ENOMEM) { -- log("check_store: No child '%s' found!\n", -- childname); -- i = rm_child_entry(node, i, childlen); -- } else { -- log("check_store: ENOMEM"); -- ret = ENOMEM; -- } -- -- next: -- talloc_free(childnode); -- talloc_free(childname); -- i += childlen + 1; -- } -- -- talloc_free(node); -- } else if (errno != ENOMEM) { -- /* Impossible, because no database should ever be without the -- root, and otherwise, we've just checked in our caller -- (which made a recursive call to get here). */ -- -- log("check_store: No child '%s' found: impossible!", name); -- } else { -- log("check_store: ENOMEM"); -- ret = ENOMEM; -+ if (hashtable_search(reachable, (void *)node->name)) { -+ log("check_store: '%s' is duplicated!", node->name); -+ return recovery ? 
WALK_TREE_RM_CHILDENTRY -+ : WALK_TREE_SKIP_CHILDREN; - } - -- return ret; -+ if (!remember_string(reachable, node->name)) -+ return WALK_TREE_ERROR_STOP; -+ -+ return WALK_TREE_OK; -+} -+ -+static int check_store_enoent(const void *ctx, struct connection *conn, -+ struct node *parent, char *name, void *arg) -+{ -+ log("check_store: node '%s' not found", name); -+ -+ return recovery ? WALK_TREE_RM_CHILDENTRY : WALK_TREE_OK; - } - - -@@ -2581,24 +2528,28 @@ static void clean_store(struct hashtable *reachable) - - void check_store(void) - { -- char * root = talloc_strdup(NULL, "/"); -- struct hashtable * reachable = -- create_hashtable(16, hash_from_key_fn, keys_equal_fn); -- -+ struct hashtable *reachable; -+ struct walk_funcs walkfuncs = { -+ .enter = check_store_step, -+ .enoent = check_store_enoent, -+ }; -+ -+ reachable = create_hashtable(16, hash_from_key_fn, keys_equal_fn); - if (!reachable) { - log("check_store: ENOMEM"); - return; - } - - log("Checking store ..."); -- if (!check_store_(root, reachable) && -- !check_transactions(reachable)) -+ if (walk_node_tree(NULL, NULL, "/", &walkfuncs, reachable)) { -+ if (errno == ENOMEM) -+ log("check_store: ENOMEM"); -+ } else if (!check_transactions(reachable)) - clean_store(reachable); - log("Checking store complete."); - - hashtable_destroy(reachable, 0 /* Don't free values (they are all - (void *)1) */); -- talloc_free(root); - } - - --- -2.37.4 - diff --git a/0077-tools-xenstore-use-treewalk-for-deleting-nodes.patch b/0077-tools-xenstore-use-treewalk-for-deleting-nodes.patch deleted file mode 100644 index 6d80a4d..0000000 --- a/0077-tools-xenstore-use-treewalk-for-deleting-nodes.patch +++ /dev/null @@ -1,180 +0,0 @@ -From 1514de3a5f23aef451133367d8dc04a26b88052f Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 77/87] tools/xenstore: use treewalk for deleting nodes - -Instead of doing an open tree walk using call recursion, use 
-walk_node_tree() when deleting a sub-tree of nodes. - -This will reduce code size and avoid many nesting levels of function -calls which could potentially exhaust the stack. - -This is part of XSA-418 / CVE-2022-42321. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit ea16962053a6849a6e7cada549ba7f8c586d85c6) ---- - tools/xenstore/xenstored_core.c | 99 ++++++++++++++------------------- - 1 file changed, 43 insertions(+), 56 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index efdd1888fd78..58fb651542ec 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1334,21 +1334,6 @@ static int do_read(const void *ctx, struct connection *conn, - return 0; - } - --static void delete_node_single(struct connection *conn, struct node *node) --{ -- TDB_DATA key; -- -- if (access_node(conn, node, NODE_ACCESS_DELETE, &key)) -- return; -- -- if (do_tdb_delete(conn, &key, &node->acc) != 0) { -- corrupt(conn, "Could not delete '%s'", node->name); -- return; -- } -- -- domain_entry_dec(conn, node); --} -- - /* Must not be / */ - static char *basename(const char *name) - { -@@ -1619,69 +1604,59 @@ static int remove_child_entry(struct connection *conn, struct node *node, - return write_node(conn, node, true); - } - --static void delete_child(struct connection *conn, -- struct node *node, const char *childname) -+static int delete_child(struct connection *conn, -+ struct node *node, const char *childname) - { - unsigned int i; - - for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) { - if (streq(node->children+i, childname)) { -- if (remove_child_entry(conn, node, i)) -- corrupt(conn, "Can't update parent node '%s'", -- node->name); -- return; -+ errno = remove_child_entry(conn, node, i) ? 
EIO : 0; -+ return errno; - } - } - corrupt(conn, "Can't find child '%s' in %s", childname, node->name); -+ -+ errno = EIO; -+ return errno; - } - --static int delete_node(struct connection *conn, const void *ctx, -- struct node *parent, struct node *node, bool watch_exact) -+static int delnode_sub(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) - { -- char *name; -+ const char *root = arg; -+ bool watch_exact; -+ int ret; -+ TDB_DATA key; - -- /* Delete children. */ -- while (node->childlen) { -- struct node *child; -+ /* Any error here will probably be repeated for all following calls. */ -+ ret = access_node(conn, node, NODE_ACCESS_DELETE, &key); -+ if (ret > 0) -+ return WALK_TREE_SUCCESS_STOP; - -- name = talloc_asprintf(node, "%s/%s", node->name, -- node->children); -- child = name ? read_node(conn, node, name) : NULL; -- if (child) { -- if (delete_node(conn, ctx, node, child, true)) -- return errno; -- } else { -- trace("delete_node: Error deleting child '%s/%s'!\n", -- node->name, node->children); -- /* Quit deleting. */ -- errno = ENOMEM; -- return errno; -- } -- talloc_free(name); -- } -+ /* In case of error stop the walk. */ -+ if (!ret && do_tdb_delete(conn, &key, &node->acc)) -+ return WALK_TREE_SUCCESS_STOP; - - /* - * Fire the watches now, when we can still see the node permissions. - * This fine as we are single threaded and the next possible read will - * be handled only after the node has been really removed. 
-- */ -+ */ -+ watch_exact = strcmp(root, node->name); - fire_watches(conn, ctx, node->name, node, watch_exact, NULL); -- delete_node_single(conn, node); -- delete_child(conn, parent, basename(node->name)); -- talloc_free(node); - -- return 0; -+ domain_entry_dec(conn, node); -+ -+ return WALK_TREE_RM_CHILDENTRY; - } - --static int _rm(struct connection *conn, const void *ctx, struct node *node, -- const char *name) -+static int _rm(struct connection *conn, const void *ctx, const char *name) - { -- /* -- * Deleting node by node, so the result is always consistent even in -- * case of a failure. -- */ - struct node *parent; - char *parentname = get_parent(ctx, name); -+ struct walk_funcs walkfuncs = { .exit = delnode_sub }; -+ int ret; - - if (!parentname) - return errno; -@@ -1689,9 +1664,21 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - parent = read_node(conn, ctx, parentname); - if (!parent) - return read_node_can_propagate_errno() ? errno : EINVAL; -- node->parent = parent; - -- return delete_node(conn, ctx, parent, node, false); -+ ret = walk_node_tree(ctx, conn, name, &walkfuncs, (void *)name); -+ if (ret < 0) { -+ if (ret == WALK_TREE_ERROR_STOP) { -+ corrupt(conn, "error when deleting sub-nodes of %s\n", -+ name); -+ errno = EIO; -+ } -+ return errno; -+ } -+ -+ if (delete_child(conn, parent, basename(name))) -+ return errno; -+ -+ return 0; - } - - -@@ -1728,7 +1715,7 @@ static int do_rm(const void *ctx, struct connection *conn, - if (streq(name, "/")) - return EINVAL; - -- ret = _rm(conn, ctx, node, name); -+ ret = _rm(conn, ctx, name); - if (ret) - return ret; - --- -2.37.4 - diff --git a/0078-tools-xenstore-use-treewalk-for-creating-node-record.patch b/0078-tools-xenstore-use-treewalk-for-creating-node-record.patch deleted file mode 100644 index d5ed8c1..0000000 --- a/0078-tools-xenstore-use-treewalk-for-creating-node-record.patch +++ /dev/null @@ -1,169 +0,0 @@ -From 7682de61a49f7692cbd31a62f12c0ca12e069575 Mon Sep 17 
00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 78/87] tools/xenstore: use treewalk for creating node records - -Instead of doing an open tree walk using call recursion, use -walk_node_tree() when creating the node records during a live update. - -This will reduce code size and avoid many nesting levels of function -calls which could potentially exhaust the stack. - -This is part of XSA-418 / CVE-2022-42321. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 297ac246a5d8ed656b349641288f3402dcc0251e) ---- - tools/xenstore/xenstored_core.c | 105 ++++++++++++-------------------- - 1 file changed, 40 insertions(+), 65 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 58fb651542ec..05d349778bb4 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -3120,101 +3120,76 @@ const char *dump_state_node_perms(FILE *fp, const struct xs_permissions *perms, - return NULL; - } - --static const char *dump_state_node_tree(FILE *fp, char *path, -- unsigned int path_max_len) -+struct dump_node_data { -+ FILE *fp; -+ const char *err; -+}; -+ -+static int dump_state_node_err(struct dump_node_data *data, const char *err) - { -- unsigned int pathlen, childlen, p = 0; -+ data->err = err; -+ return WALK_TREE_ERROR_STOP; -+} -+ -+static int dump_state_node(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) -+{ -+ struct dump_node_data *data = arg; -+ FILE *fp = data->fp; -+ unsigned int pathlen; - struct xs_state_record_header head; - struct xs_state_node sn; -- TDB_DATA key, data; -- const struct xs_tdb_record_hdr *hdr; -- const char *child; - const char *ret; - -- pathlen = strlen(path) + 1; -- -- set_tdb_key(path, &key); -- data = tdb_fetch(tdb_ctx, key); -- if (data.dptr == NULL) -- return "Error reading node"; -- -- /* Clean up in case of 
failure. */ -- talloc_steal(path, data.dptr); -- -- hdr = (void *)data.dptr; -+ pathlen = strlen(node->name) + 1; - - head.type = XS_STATE_TYPE_NODE; - head.length = sizeof(sn); - sn.conn_id = 0; - sn.ta_id = 0; - sn.ta_access = 0; -- sn.perm_n = hdr->num_perms; -+ sn.perm_n = node->perms.num; - sn.path_len = pathlen; -- sn.data_len = hdr->datalen; -- head.length += hdr->num_perms * sizeof(*sn.perms); -+ sn.data_len = node->datalen; -+ head.length += node->perms.num * sizeof(*sn.perms); - head.length += pathlen; -- head.length += hdr->datalen; -+ head.length += node->datalen; - head.length = ROUNDUP(head.length, 3); - - if (fwrite(&head, sizeof(head), 1, fp) != 1) -- return "Dump node state error"; -+ return dump_state_node_err(data, "Dump node head error"); - if (fwrite(&sn, sizeof(sn), 1, fp) != 1) -- return "Dump node state error"; -+ return dump_state_node_err(data, "Dump node state error"); - -- ret = dump_state_node_perms(fp, hdr->perms, hdr->num_perms); -+ ret = dump_state_node_perms(fp, node->perms.p, node->perms.num); - if (ret) -- return ret; -+ return dump_state_node_err(data, ret); - -- if (fwrite(path, pathlen, 1, fp) != 1) -- return "Dump node path error"; -- if (hdr->datalen && -- fwrite(hdr->perms + hdr->num_perms, hdr->datalen, 1, fp) != 1) -- return "Dump node data error"; -+ if (fwrite(node->name, pathlen, 1, fp) != 1) -+ return dump_state_node_err(data, "Dump node path error"); -+ -+ if (node->datalen && fwrite(node->data, node->datalen, 1, fp) != 1) -+ return dump_state_node_err(data, "Dump node data error"); - - ret = dump_state_align(fp); - if (ret) -- return ret; -+ return dump_state_node_err(data, ret); - -- child = (char *)(hdr->perms + hdr->num_perms) + hdr->datalen; -- -- /* -- * Use path for constructing children paths. -- * As we don't write out nodes without having written their parent -- * already we will never clobber a part of the path we'll need later. 
-- */ -- pathlen--; -- if (path[pathlen - 1] != '/') { -- path[pathlen] = '/'; -- pathlen++; -- } -- while (p < hdr->childlen) { -- childlen = strlen(child) + 1; -- if (pathlen + childlen > path_max_len) -- return "Dump node path length error"; -- strcpy(path + pathlen, child); -- ret = dump_state_node_tree(fp, path, path_max_len); -- if (ret) -- return ret; -- p += childlen; -- child += childlen; -- } -- -- talloc_free(data.dptr); -- -- return NULL; -+ return WALK_TREE_OK; - } - - const char *dump_state_nodes(FILE *fp, const void *ctx) - { -- char *path; -+ struct dump_node_data data = { -+ .fp = fp, -+ .err = "Dump node walk error" -+ }; -+ struct walk_funcs walkfuncs = { .enter = dump_state_node }; - -- path = talloc_size(ctx, XENSTORE_ABS_PATH_MAX + 1); -- if (!path) -- return "Path buffer allocation error"; -+ if (walk_node_tree(ctx, NULL, "/", &walkfuncs, &data)) -+ return data.err; - -- strcpy(path, "/"); -- -- return dump_state_node_tree(fp, path, XENSTORE_ABS_PATH_MAX + 1); -+ return NULL; - } - - void read_state_global(const void *ctx, const void *state) --- -2.37.4 - diff --git a/0079-tools-xenstore-remove-nodes-owned-by-destroyed-domai.patch b/0079-tools-xenstore-remove-nodes-owned-by-destroyed-domai.patch deleted file mode 100644 index f6ba349..0000000 --- a/0079-tools-xenstore-remove-nodes-owned-by-destroyed-domai.patch +++ /dev/null @@ -1,298 +0,0 @@ -From 825332daeac9fc3ac1e482e805ac4a3bc1e1ab34 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: [PATCH 79/87] tools/xenstore: remove nodes owned by destroyed domain - -In case a domain is removed from Xenstore, remove all nodes owned by -it per default. - -This tackles the problem that nodes might be created by a domain -outside its home path in Xenstore, leading to Xenstore hogging more -and more memory. Domain quota don't work in this case if the guest is -rebooting in between. 
- -Since XSA-322 ownership of such stale nodes is transferred to dom0, -which is helping against unintended access, but not against OOM of -Xenstore. - -As a fallback for weird cases add a Xenstore start parameter for -keeping today's way to handle stale nodes, adding the risk of Xenstore -hitting an OOM situation. - -This is part of XSA-419 / CVE-2022-42322. - -Fixes: 496306324d8d ("tools/xenstore: revoke access rights for removed domains") -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 755d3f9debf8879448211fffb018f556136f6a79) ---- - tools/xenstore/xenstored_core.c | 17 +++++-- - tools/xenstore/xenstored_core.h | 4 ++ - tools/xenstore/xenstored_domain.c | 84 +++++++++++++++++++++++-------- - tools/xenstore/xenstored_domain.h | 2 +- - 4 files changed, 80 insertions(+), 27 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 05d349778bb4..0ca1a5a19ac2 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -80,6 +80,7 @@ static bool verbose = false; - LIST_HEAD(connections); - int tracefd = -1; - static bool recovery = true; -+bool keep_orphans = false; - static int reopen_log_pipe[2]; - static int reopen_log_pipe0_pollfd_idx = -1; - char *tracefile = NULL; -@@ -757,7 +758,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - node->perms.p = hdr->perms; - node->acc.domid = node->perms.p[0].id; - node->acc.memory = data.dsize; -- if (domain_adjust_node_perms(conn, node)) -+ if (domain_adjust_node_perms(node)) - goto error; - - /* If owner is gone reset currently accounted memory size. 
*/ -@@ -800,7 +801,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - void *p; - struct xs_tdb_record_hdr *hdr; - -- if (domain_adjust_node_perms(conn, node)) -+ if (domain_adjust_node_perms(node)) - return errno; - - data.dsize = sizeof(*hdr) -@@ -1651,7 +1652,7 @@ static int delnode_sub(const void *ctx, struct connection *conn, - return WALK_TREE_RM_CHILDENTRY; - } - --static int _rm(struct connection *conn, const void *ctx, const char *name) -+int rm_node(struct connection *conn, const void *ctx, const char *name) - { - struct node *parent; - char *parentname = get_parent(ctx, name); -@@ -1715,7 +1716,7 @@ static int do_rm(const void *ctx, struct connection *conn, - if (streq(name, "/")) - return EINVAL; - -- ret = _rm(conn, ctx, name); -+ ret = rm_node(conn, ctx, name); - if (ret) - return ret; - -@@ -2639,6 +2640,8 @@ static void usage(void) - " -R, --no-recovery to request that no recovery should be attempted when\n" - " the store is corrupted (debug only),\n" - " -I, --internal-db store database in memory, not on disk\n" -+" -K, --keep-orphans don't delete nodes owned by a domain when the\n" -+" domain is deleted (this is a security risk!)\n" - " -V, --verbose to request verbose execution.\n"); - } - -@@ -2663,6 +2666,7 @@ static struct option options[] = { - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, -+ { "keep-orphans", 0, NULL, 'K' }, - { "verbose", 0, NULL, 'V' }, - { "watch-nb", 1, NULL, 'W' }, - #ifndef NO_LIVE_UPDATE -@@ -2742,7 +2746,7 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:q:T:RVW:w:U", -+ while ((opt = getopt_long(argc, argv, "DE:F:HKNPS:t:A:M:Q:q:T:RVW:w:U", - options, NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2778,6 +2782,9 @@ int main(int argc, char *argv[]) - case 'I': - tdb_flags = TDB_INTERNAL|TDB_NOLOCK; - break; -+ case 'K': -+ keep_orphans = 
true; -+ break; - case 'V': - verbose = true; - break; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 2d9942171d92..725793257e4a 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -240,6 +240,9 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - struct node *read_node(struct connection *conn, const void *ctx, - const char *name); - -+/* Remove a node and its children. */ -+int rm_node(struct connection *conn, const void *ctx, const char *name); -+ - void setup_structure(bool live_update); - struct connection *new_connection(const struct interface_funcs *funcs); - struct connection *get_connection_by_id(unsigned int conn_id); -@@ -286,6 +289,7 @@ extern int quota_req_outstanding; - extern int quota_trans_nodes; - extern int quota_memory_per_domain_soft; - extern int quota_memory_per_domain_hard; -+extern bool keep_orphans; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index e2f1b09c6037..8b134017a27a 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -227,10 +227,64 @@ static void unmap_interface(void *interface) - xengnttab_unmap(*xgt_handle, interface, 1); - } - -+static int domain_tree_remove_sub(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) -+{ -+ struct domain *domain = arg; -+ TDB_DATA key; -+ int ret = WALK_TREE_OK; -+ -+ if (node->perms.p[0].id != domain->domid) -+ return WALK_TREE_OK; -+ -+ if (keep_orphans) { -+ set_tdb_key(node->name, &key); -+ domain->nbentry--; -+ node->perms.p[0].id = priv_domid; -+ node->acc.memory = 0; -+ domain_entry_inc(NULL, node); -+ if (write_node_raw(NULL, &key, node, true)) { -+ /* That's unfortunate. We only can try to continue. 
*/ -+ syslog(LOG_ERR, -+ "error when moving orphaned node %s to dom0\n", -+ node->name); -+ } else -+ trace("orphaned node %s moved to dom0\n", node->name); -+ } else { -+ if (rm_node(NULL, ctx, node->name)) { -+ /* That's unfortunate. We only can try to continue. */ -+ syslog(LOG_ERR, -+ "error when deleting orphaned node %s\n", -+ node->name); -+ } else -+ trace("orphaned node %s deleted\n", node->name); -+ -+ /* Skip children in all cases in order to avoid more errors. */ -+ ret = WALK_TREE_SKIP_CHILDREN; -+ } -+ -+ return domain->nbentry > 0 ? ret : WALK_TREE_SUCCESS_STOP; -+} -+ -+static void domain_tree_remove(struct domain *domain) -+{ -+ int ret; -+ struct walk_funcs walkfuncs = { .enter = domain_tree_remove_sub }; -+ -+ if (domain->nbentry > 0) { -+ ret = walk_node_tree(domain, NULL, "/", &walkfuncs, domain); -+ if (ret == WALK_TREE_ERROR_STOP) -+ syslog(LOG_ERR, -+ "error when looking for orphaned nodes\n"); -+ } -+} -+ - static int destroy_domain(void *_domain) - { - struct domain *domain = _domain; - -+ domain_tree_remove(domain); -+ - list_del(&domain->list); - - if (!domain->introduced) -@@ -851,15 +905,15 @@ int domain_entry_inc(struct connection *conn, struct node *node) - struct domain *d; - unsigned int domid; - -- if (!conn) -+ if (!node->perms.p) - return 0; - -- domid = node->perms.p ? node->perms.p[0].id : conn->id; -+ domid = node->perms.p[0].id; - -- if (conn->transaction) { -+ if (conn && conn->transaction) { - transaction_entry_inc(conn->transaction, domid); - } else { -- d = (domid == conn->id && conn->domain) ? conn->domain -+ d = (conn && domid == conn->id && conn->domain) ? conn->domain - : find_or_alloc_existing_domain(domid); - if (d) - d->nbentry++; -@@ -920,23 +974,11 @@ int domain_alloc_permrefs(struct node_perms *perms) - * Remove permissions for no longer existing domains in order to avoid a new - * domain with the same domid inheriting the permissions. 
- */ --int domain_adjust_node_perms(struct connection *conn, struct node *node) -+int domain_adjust_node_perms(struct node *node) - { - unsigned int i; - int ret; - -- ret = chk_domain_generation(node->perms.p[0].id, node->generation); -- -- /* If the owner doesn't exist any longer give it to priv domain. */ -- if (!ret) { -- /* -- * In theory we'd need to update the number of dom0 nodes here, -- * but we could be called for a read of the node. So better -- * avoid the risk to overflow the node count of dom0. -- */ -- node->perms.p[0].id = priv_domid; -- } -- - for (i = 1; i < node->perms.num; i++) { - if (node->perms.p[i].perms & XS_PERM_IGNORE) - continue; -@@ -954,15 +996,15 @@ void domain_entry_dec(struct connection *conn, struct node *node) - struct domain *d; - unsigned int domid; - -- if (!conn) -+ if (!node->perms.p) - return; - - domid = node->perms.p ? node->perms.p[0].id : conn->id; - -- if (conn->transaction) { -+ if (conn && conn->transaction) { - transaction_entry_dec(conn->transaction, domid); - } else { -- d = (domid == conn->id && conn->domain) ? conn->domain -+ d = (conn && domid == conn->id && conn->domain) ? conn->domain - : find_domain_struct(domid); - if (d) { - d->nbentry--; -@@ -1081,7 +1123,7 @@ int domain_memory_add(unsigned int domid, int mem, bool no_quota_check) - * exist, as accounting is done either for a domain related to - * the current connection, or for the domain owning a node - * (which is always existing, as the owner of the node is -- * tested to exist and replaced by domid 0 if not). -+ * tested to exist and deleted or replaced by domid 0 if not). - * So not finding the related domain MUST be an error in the - * data base. 
- */ -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 40fe5f690900..5454e925ad15 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -61,7 +61,7 @@ const char *get_implicit_path(const struct connection *conn); - bool domain_is_unprivileged(struct connection *conn); - - /* Remove node permissions for no longer existing domains. */ --int domain_adjust_node_perms(struct connection *conn, struct node *node); -+int domain_adjust_node_perms(struct node *node); - int domain_alloc_permrefs(struct node_perms *perms); - - /* Quota manipulation */ --- -2.37.4 - diff --git a/0080-tools-xenstore-make-the-internal-memory-data-base-th.patch b/0080-tools-xenstore-make-the-internal-memory-data-base-th.patch deleted file mode 100644 index 53d6227..0000000 --- a/0080-tools-xenstore-make-the-internal-memory-data-base-th.patch +++ /dev/null @@ -1,101 +0,0 @@ -From 8b81fc185ab13feca2f63eda3792189e5ac11a97 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:13 +0200 -Subject: [PATCH 80/87] tools/xenstore: make the internal memory data base the - default - -Having a file backed data base has the only advantage of being capable -to dump the contents of it while Xenstore is running, and potentially -using less swap space in case the data base can't be kept in memory. - -It has the major disadvantage of a huge performance overhead: switching -to keep the data base in memory only speeds up live update of xenstored -with 120000 nodes from 20 minutes to 11 seconds. A complete tree walk -of this configuration will be reduced from 7 seconds to 280 msecs -(measured by "xenstore-control check"). - -So make the internal memory data base the default and enhance the -"--internal-db" command line parameter to take an optional parameter -allowing to switch the internal data base back to the file based one. - -This is part of XSA-419. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit d174fefa90487ddd25ebc618028f67b2e8a1f795) ---- - tools/helpers/init-xenstore-domain.c | 4 ++-- - tools/xenstore/xenstored_core.c | 13 ++++++++----- - 2 files changed, 10 insertions(+), 7 deletions(-) - -diff --git a/tools/helpers/init-xenstore-domain.c b/tools/helpers/init-xenstore-domain.c -index 11ebf79e6d26..8d1d1a4f1e3a 100644 ---- a/tools/helpers/init-xenstore-domain.c -+++ b/tools/helpers/init-xenstore-domain.c -@@ -223,9 +223,9 @@ static int build(xc_interface *xch) - } - - if ( param ) -- snprintf(cmdline, 512, "--event %d --internal-db %s", rv, param); -+ snprintf(cmdline, 512, "--event %d %s", rv, param); - else -- snprintf(cmdline, 512, "--event %d --internal-db", rv); -+ snprintf(cmdline, 512, "--event %d", rv); - - dom->guest_domid = domid; - dom->cmdline = xc_dom_strdup(dom, cmdline); -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 0ca1a5a19ac2..041124d8b7a5 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2329,7 +2329,7 @@ static void accept_connection(int sock) - } - #endif - --static int tdb_flags; -+static int tdb_flags = TDB_INTERNAL | TDB_NOLOCK; - - /* We create initial nodes manually. 
*/ - static void manual_node(const char *name, const char *child) -@@ -2639,7 +2639,8 @@ static void usage(void) - " watch-event: time a watch-event is kept pending\n" - " -R, --no-recovery to request that no recovery should be attempted when\n" - " the store is corrupted (debug only),\n" --" -I, --internal-db store database in memory, not on disk\n" -+" -I, --internal-db [on|off] store database in memory, not on disk, default is\n" -+" memory, with \"--internal-db off\" it is on disk\n" - " -K, --keep-orphans don't delete nodes owned by a domain when the\n" - " domain is deleted (this is a security risk!)\n" - " -V, --verbose to request verbose execution.\n"); -@@ -2665,7 +2666,7 @@ static struct option options[] = { - { "quota-soft", 1, NULL, 'q' }, - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, -- { "internal-db", 0, NULL, 'I' }, -+ { "internal-db", 2, NULL, 'I' }, - { "keep-orphans", 0, NULL, 'K' }, - { "verbose", 0, NULL, 'V' }, - { "watch-nb", 1, NULL, 'W' }, -@@ -2746,7 +2747,8 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HKNPS:t:A:M:Q:q:T:RVW:w:U", -+ while ((opt = getopt_long(argc, argv, -+ "DE:F:HI::KNPS:t:A:M:Q:q:T:RVW:w:U", - options, NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2780,7 +2782,8 @@ int main(int argc, char *argv[]) - tracefile = optarg; - break; - case 'I': -- tdb_flags = TDB_INTERNAL|TDB_NOLOCK; -+ if (optarg && !strcmp(optarg, "off")) -+ tdb_flags = 0; - break; - case 'K': - keep_orphans = true; --- -2.37.4 - diff --git a/0081-docs-enhance-xenstore.txt-with-permissions-descripti.patch b/0081-docs-enhance-xenstore.txt-with-permissions-descripti.patch deleted file mode 100644 index c0b9c4a..0000000 --- a/0081-docs-enhance-xenstore.txt-with-permissions-descripti.patch +++ /dev/null @@ -1,50 +0,0 @@ -From 1f5b394d6ed0ee26b5878bd0cdf4a698bbc4294f Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 
07:35:13 +0200 -Subject: [PATCH 81/87] docs: enhance xenstore.txt with permissions description -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The permission scheme of Xenstore nodes is not really covered by -docs/misc/xenstore.txt, other than referring to the Xen wiki. - -Add a paragraph explaining the permissions of nodes, and especially -mentioning removal of nodes when a domain has been removed from -Xenstore. - -This is part of XSA-419. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit d084d2c6dff7044956ebdf83a259ad6081a1d921) ---- - docs/misc/xenstore.txt | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/docs/misc/xenstore.txt b/docs/misc/xenstore.txt -index a7d006519ae8..eccd596ee38c 100644 ---- a/docs/misc/xenstore.txt -+++ b/docs/misc/xenstore.txt -@@ -43,6 +43,17 @@ bytes are forbidden; clients specifying relative paths should keep - them to within 2048 bytes. (See XENSTORE_*_PATH_MAX in xs_wire.h.) - - -+Each node has one or multiple permission entries. Permissions are -+granted by domain-id, the first permission entry of each node specifies -+the owner of the node. Permissions of a node can be changed by the -+owner of the node, the owner can only be modified by the control -+domain (usually domain id 0). The owner always has the right to read -+and write the node, while other permissions can be setup to allow -+read and/or write access. When a domain is being removed from Xenstore -+nodes owned by that domain will be removed together with all of those -+nodes' children. 
-+ -+ - Communication with xenstore is via either sockets, or event channel - and shared memory, as specified in io/xs_wire.h: each message in - either direction is a header formatted as a struct xsd_sockmsg --- -2.37.4 - diff --git a/0082-tools-ocaml-xenstored-Fix-quota-bypass-on-domain-shu.patch b/0082-tools-ocaml-xenstored-Fix-quota-bypass-on-domain-shu.patch deleted file mode 100644 index 1cdc2b2..0000000 --- a/0082-tools-ocaml-xenstored-Fix-quota-bypass-on-domain-shu.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 5b0919f2c0e5060f6e0bc328f100abae0a9f07b8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:06 +0100 -Subject: [PATCH 82/87] tools/ocaml/xenstored: Fix quota bypass on domain - shutdown -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -XSA-322 fixed a domid reuse vulnerability by assigning Dom0 as the owner of -any nodes left after a domain is shutdown (e.g. outside its /local/domain/N -tree). - -However Dom0 has no quota on purpose, so this opened up another potential -attack vector. Avoid it by deleting these nodes instead of assigning them to -Dom0. - -This is part of XSA-419 / CVE-2022-42323. 
- -Fixes: c46eff921209 ("tools/ocaml/xenstored: clean up permissions for dead domains") -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit db471408edd46af403b8bd44d180a928ad7fbb80) ---- - tools/ocaml/xenstored/perms.ml | 3 +-- - tools/ocaml/xenstored/store.ml | 29 +++++++++++++++++++++-------- - 2 files changed, 22 insertions(+), 10 deletions(-) - -diff --git a/tools/ocaml/xenstored/perms.ml b/tools/ocaml/xenstored/perms.ml -index e8a16221f8fa..84f2503e8e29 100644 ---- a/tools/ocaml/xenstored/perms.ml -+++ b/tools/ocaml/xenstored/perms.ml -@@ -64,8 +64,7 @@ let get_owner perm = perm.owner - * *) - let remove_domid ~domid perm = - let acl = List.filter (fun (acl_domid, _) -> acl_domid <> domid) perm.acl in -- let owner = if perm.owner = domid then 0 else perm.owner in -- { perm with acl; owner } -+ if perm.owner = domid then None else Some { perm with acl; owner = perm.owner } - - let default0 = create 0 NONE [] - -diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml -index 20e67b142746..70f0c83de404 100644 ---- a/tools/ocaml/xenstored/store.ml -+++ b/tools/ocaml/xenstored/store.ml -@@ -87,10 +87,21 @@ let check_owner node connection = - - let rec recurse fct node = fct node; SymbolMap.iter (fun _ -> recurse fct) node.children - --(** [recurse_map f tree] applies [f] on each node in the tree recursively *) --let recurse_map f = -+(** [recurse_filter_map f tree] applies [f] on each node in the tree recursively, -+ possibly removing some nodes. -+ Note that the nodes removed this way won't generate watch events. 
-+*) -+let recurse_filter_map f = -+ let invalid = -1 in -+ let is_valid _ node = node.perms.owner <> invalid in - let rec walk node = -- f { node with children = SymbolMap.map walk node.children } -+ (* Map.filter_map is Ocaml 4.11+ only *) -+ let node = -+ { node with children = -+ SymbolMap.map walk node.children |> SymbolMap.filter is_valid } in -+ match f node with -+ | Some keep -> keep -+ | None -> { node with perms = {node.perms with owner = invalid } } - in - walk - -@@ -444,11 +455,13 @@ let setperms store perm path nperms = - - let reset_permissions store domid = - Logging.info "store|node" "Cleaning up xenstore ACLs for domid %d" domid; -- store.root <- Node.recurse_map (fun node -> -- let perms = Perms.Node.remove_domid ~domid node.perms in -- if perms <> node.perms then -- Logging.debug "store|node" "Changed permissions for node %s" (Node.get_name node); -- { node with perms } -+ store.root <- Node.recurse_filter_map (fun node -> -+ match Perms.Node.remove_domid ~domid node.perms with -+ | None -> None -+ | Some perms -> -+ if perms <> node.perms then -+ Logging.debug "store|node" "Changed permissions for node %s" (Node.get_name node); -+ Some { node with perms } - ) store.root - - type ops = { --- -2.37.4 - diff --git a/0083-tools-ocaml-Ensure-packet-size-is-never-negative.patch b/0083-tools-ocaml-Ensure-packet-size-is-never-negative.patch deleted file mode 100644 index 5fc3c77..0000000 --- a/0083-tools-ocaml-Ensure-packet-size-is-never-negative.patch +++ /dev/null @@ -1,75 +0,0 @@ -From 635390415f4a9c0621330f0b40f8c7e914c4523f Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com> -Date: Wed, 12 Oct 2022 19:13:05 +0100 -Subject: [PATCH 83/87] tools/ocaml: Ensure packet size is never negative -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Integers in Ocaml have 63 or 31 bits of signed precision. 
- -On 64-bit builds of Ocaml, this is fine because a C uint32_t always fits -within a 63-bit signed integer. - -In 32-bit builds of Ocaml, this goes wrong. The C uint32_t is truncated -first (loses the top bit), then has a unsigned/signed mismatch. - -A "negative" value (i.e. a packet on the ring of between 1G and 2G in size) -will trigger an exception later in Bytes.make in xb.ml, and because the packet -is not removed from the ring, the exception re-triggers on every subsequent -query, creating a livelock. - -Fix both the source of the exception in Xb, and as defence in depth, mark the -domain as bad for any Invalid_argument exceptions to avoid the risk of -livelock. - -This is XSA-420 / CVE-2022-42324. - -Signed-off-by: Edwin Török <edvin.torok@citrix.com> -Acked-by: Christian Lindig <christian.lindig@citrix.com> -(cherry picked from commit ae34df4d82636f4c82700b447ea2c93b9f82b3f3) ---- - tools/ocaml/libs/xb/partial.ml | 6 +++--- - tools/ocaml/xenstored/process.ml | 2 +- - 2 files changed, 4 insertions(+), 4 deletions(-) - -diff --git a/tools/ocaml/libs/xb/partial.ml b/tools/ocaml/libs/xb/partial.ml -index b6e2a716e263..3aa8927eb7f0 100644 ---- a/tools/ocaml/libs/xb/partial.ml -+++ b/tools/ocaml/libs/xb/partial.ml -@@ -36,7 +36,7 @@ let of_string s = - This will leave the guest connection is a bad state and will - be hard to recover from without restarting the connection - (ie rebooting the guest) *) -- let dlen = min xenstore_payload_max dlen in -+ let dlen = max 0 (min xenstore_payload_max dlen) in - { - tid = tid; - rid = rid; -@@ -46,8 +46,8 @@ let of_string s = - } - - let append pkt s sz = -- if pkt.len > 4096 then failwith "Buffer.add: cannot grow buffer"; -- Buffer.add_string pkt.buf (String.sub s 0 sz) -+ if Buffer.length pkt.buf + sz > xenstore_payload_max then failwith "Buffer.add: cannot grow buffer"; -+ Buffer.add_substring pkt.buf s 0 sz - - let to_complete pkt = - pkt.len - (Buffer.length pkt.buf) -diff --git a/tools/ocaml/xenstored/process.ml 
b/tools/ocaml/xenstored/process.ml -index ce39ce28b5f3..6cb990ee7fb2 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -722,7 +722,7 @@ let do_input store cons doms con = - History.reconnect con; - info "%s reconnection complete" (Connection.get_domstr con); - None -- | Failure exp -> -+ | Invalid_argument exp | Failure exp -> - error "caught exception %s" exp; - error "got a bad client %s" (sprintf "%-8s" (Connection.get_domstr con)); - Connection.mark_as_bad con; --- -2.37.4 - diff --git a/0084-tools-xenstore-fix-deleting-node-in-transaction.patch b/0084-tools-xenstore-fix-deleting-node-in-transaction.patch deleted file mode 100644 index 4ab044c..0000000 --- a/0084-tools-xenstore-fix-deleting-node-in-transaction.patch +++ /dev/null @@ -1,46 +0,0 @@ -From 4305807dfdc183f4acd170fe00eb66b338fa6430 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:13 +0200 -Subject: [PATCH 84/87] tools/xenstore: fix deleting node in transaction - -In case a node has been created in a transaction and it is later -deleted in the same transaction, the transaction will be terminated -with an error. - -As this error is encountered only when handling the deleted node at -transaction finalization, the transaction will have been performed -partially and without updating the accounting information. This will -enable a malicious guest to create arbitrary number of nodes. - -This is part of XSA-421 / CVE-2022-42325. 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Tested-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 13ac37f1416cae88d97f7baf6cf2a827edb9a187) ---- - tools/xenstore/xenstored_transaction.c | 8 +++++++- - 1 file changed, 7 insertions(+), 1 deletion(-) - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 3e3eb47326cc..7ffe21bb5285 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -418,7 +418,13 @@ static int finalize_transaction(struct connection *conn, - true); - talloc_free(data.dptr); - } else { -- ret = do_tdb_delete(conn, &key, NULL); -+ /* -+ * A node having been created and later deleted -+ * in this transaction will have no generation -+ * information stored. -+ */ -+ ret = (i->generation == NO_GENERATION) -+ ? 0 : do_tdb_delete(conn, &key, NULL); - } - if (ret) - goto err; --- -2.37.4 - diff --git a/0085-tools-xenstore-harden-transaction-finalization-again.patch b/0085-tools-xenstore-harden-transaction-finalization-again.patch deleted file mode 100644 index 6718ae7..0000000 --- a/0085-tools-xenstore-harden-transaction-finalization-again.patch +++ /dev/null @@ -1,410 +0,0 @@ -From 1bdd7c438b399e2ecce9e3c72bd7c1ae56df60f8 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 13 Sep 2022 07:35:14 +0200 -Subject: [PATCH 85/87] tools/xenstore: harden transaction finalization against - errors - -When finalizing a transaction, any error occurring after checking for -conflicts will result in the transaction being performed only -partially today. Additionally accounting data will not be updated at -the end of the transaction, which might result in further problems -later. 
- -Avoid those problems by multiple modifications: - -- free any transaction specific nodes which don't need to be committed - as they haven't been written during the transaction as soon as their - generation count has been verified, this will reduce the risk of - out-of-memory situations - -- store the transaction specific node name in struct accessed_node in - order to avoid the need to allocate additional memory for it when - finalizing the transaction - -- don't stop the transaction finalization when hitting an error - condition, but try to continue to handle all modified nodes - -- in case of a detected error do the accounting update as needed and - call the data base checking only after that - -- if writing a node in a transaction is failing (e.g. due to a failed - quota check), fail the transaction, as prior changes to struct - accessed_node can't easily be undone in that case - -This is part of XSA-421 / CVE-2022-42326. - -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Julien Grall <jgrall@amazon.com> -Tested-by: Julien Grall <jgrall@amazon.com> -(cherry picked from commit 2dd823ca7237e7fb90c890642d6a3b357a26fcff) ---- - tools/xenstore/xenstored_core.c | 16 ++- - tools/xenstore/xenstored_transaction.c | 171 +++++++++++-------------- - tools/xenstore/xenstored_transaction.h | 4 +- - 3 files changed, 92 insertions(+), 99 deletions(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 041124d8b7a5..ccb7f0a92578 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -727,8 +727,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - return NULL; - } - -- if (transaction_prepend(conn, name, &key)) -- return NULL; -+ transaction_prepend(conn, name, &key); - - data = tdb_fetch(tdb_ctx, key); - -@@ -846,10 +845,21 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - static int write_node(struct connection *conn, struct node *node, - 
bool no_quota_check) - { -+ int ret; -+ - if (access_node(conn, node, NODE_ACCESS_WRITE, &node->key)) - return errno; - -- return write_node_raw(conn, &node->key, node, no_quota_check); -+ ret = write_node_raw(conn, &node->key, node, no_quota_check); -+ if (ret && conn && conn->transaction) { -+ /* -+ * Reverting access_node() is hard, so just fail the -+ * transaction. -+ */ -+ fail_transaction(conn->transaction); -+ } -+ -+ return ret; - } - - unsigned int perm_for_conn(struct connection *conn, -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 7ffe21bb5285..ac854197cadb 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -114,7 +114,8 @@ struct accessed_node - struct list_head list; - - /* The name of the node. */ -- char *node; -+ char *trans_name; /* Transaction specific name. */ -+ char *node; /* Main data base name. */ - - /* Generation count (or NO_GENERATION) for conflict checking. */ - uint64_t generation; -@@ -199,25 +200,20 @@ static char *transaction_get_node_name(void *ctx, struct transaction *trans, - * Prepend the transaction to name if node has been modified in the current - * transaction. 
- */ --int transaction_prepend(struct connection *conn, const char *name, -- TDB_DATA *key) -+void transaction_prepend(struct connection *conn, const char *name, -+ TDB_DATA *key) - { -- char *tdb_name; -+ struct accessed_node *i; - -- if (!conn || !conn->transaction || -- !find_accessed_node(conn->transaction, name)) { -- set_tdb_key(name, key); -- return 0; -+ if (conn && conn->transaction) { -+ i = find_accessed_node(conn->transaction, name); -+ if (i) { -+ set_tdb_key(i->trans_name, key); -+ return; -+ } - } - -- tdb_name = transaction_get_node_name(conn->transaction, -- conn->transaction, name); -- if (!tdb_name) -- return errno; -- -- set_tdb_key(tdb_name, key); -- -- return 0; -+ set_tdb_key(name, key); - } - - /* -@@ -240,7 +236,6 @@ int access_node(struct connection *conn, struct node *node, - struct accessed_node *i = NULL; - struct transaction *trans; - TDB_DATA local_key; -- const char *trans_name = NULL; - int ret; - bool introduce = false; - -@@ -259,10 +254,6 @@ int access_node(struct connection *conn, struct node *node, - - trans = conn->transaction; - -- trans_name = transaction_get_node_name(node, trans, node->name); -- if (!trans_name) -- goto nomem; -- - i = find_accessed_node(trans, node->name); - if (!i) { - if (trans->nodes >= quota_trans_nodes && -@@ -273,9 +264,10 @@ int access_node(struct connection *conn, struct node *node, - i = talloc_zero(trans, struct accessed_node); - if (!i) - goto nomem; -- i->node = talloc_strdup(i, node->name); -- if (!i->node) -+ i->trans_name = transaction_get_node_name(i, trans, node->name); -+ if (!i->trans_name) - goto nomem; -+ i->node = strchr(i->trans_name, '/') + 1; - if (node->generation != NO_GENERATION && node->perms.num) { - i->perms.p = talloc_array(i, struct xs_permissions, - node->perms.num); -@@ -302,7 +294,7 @@ int access_node(struct connection *conn, struct node *node, - i->generation = node->generation; - i->check_gen = true; - if (node->generation != NO_GENERATION) { -- 
set_tdb_key(trans_name, &local_key); -+ set_tdb_key(i->trans_name, &local_key); - ret = write_node_raw(conn, &local_key, node, true); - if (ret) - goto err; -@@ -321,7 +313,7 @@ int access_node(struct connection *conn, struct node *node, - return -1; - - if (key) { -- set_tdb_key(trans_name, key); -+ set_tdb_key(i->trans_name, key); - if (type == NODE_ACCESS_WRITE) - i->ta_node = true; - if (type == NODE_ACCESS_DELETE) -@@ -333,7 +325,6 @@ int access_node(struct connection *conn, struct node *node, - nomem: - ret = ENOMEM; - err: -- talloc_free((void *)trans_name); - talloc_free(i); - trans->fail = true; - errno = ret; -@@ -371,100 +362,90 @@ void queue_watches(struct connection *conn, const char *name, bool watch_exact) - * base. - */ - static int finalize_transaction(struct connection *conn, -- struct transaction *trans) -+ struct transaction *trans, bool *is_corrupt) - { -- struct accessed_node *i; -+ struct accessed_node *i, *n; - TDB_DATA key, ta_key, data; - struct xs_tdb_record_hdr *hdr; - uint64_t gen; -- char *trans_name; -- int ret; - -- list_for_each_entry(i, &trans->accessed, list) { -- if (!i->check_gen) -- continue; -+ list_for_each_entry_safe(i, n, &trans->accessed, list) { -+ if (i->check_gen) { -+ set_tdb_key(i->node, &key); -+ data = tdb_fetch(tdb_ctx, key); -+ hdr = (void *)data.dptr; -+ if (!data.dptr) { -+ if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST) -+ return EIO; -+ gen = NO_GENERATION; -+ } else -+ gen = hdr->generation; -+ talloc_free(data.dptr); -+ if (i->generation != gen) -+ return EAGAIN; -+ } - -- set_tdb_key(i->node, &key); -- data = tdb_fetch(tdb_ctx, key); -- hdr = (void *)data.dptr; -- if (!data.dptr) { -- if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST) -- return EIO; -- gen = NO_GENERATION; -- } else -- gen = hdr->generation; -- talloc_free(data.dptr); -- if (i->generation != gen) -- return EAGAIN; -+ /* Entries for unmodified nodes can be removed early. 
*/ -+ if (!i->modified) { -+ if (i->ta_node) { -+ set_tdb_key(i->trans_name, &ta_key); -+ if (do_tdb_delete(conn, &ta_key, NULL)) -+ return EIO; -+ } -+ list_del(&i->list); -+ talloc_free(i); -+ } - } - - while ((i = list_top(&trans->accessed, struct accessed_node, list))) { -- trans_name = transaction_get_node_name(i, trans, i->node); -- if (!trans_name) -- /* We are doomed: the transaction is only partial. */ -- goto err; -- -- set_tdb_key(trans_name, &ta_key); -- -- if (i->modified) { -- set_tdb_key(i->node, &key); -- if (i->ta_node) { -- data = tdb_fetch(tdb_ctx, ta_key); -- if (!data.dptr) -- goto err; -+ set_tdb_key(i->node, &key); -+ if (i->ta_node) { -+ set_tdb_key(i->trans_name, &ta_key); -+ data = tdb_fetch(tdb_ctx, ta_key); -+ if (data.dptr) { - hdr = (void *)data.dptr; - hdr->generation = ++generation; -- ret = do_tdb_write(conn, &key, &data, NULL, -- true); -+ *is_corrupt |= do_tdb_write(conn, &key, &data, -+ NULL, true); - talloc_free(data.dptr); -+ if (do_tdb_delete(conn, &ta_key, NULL)) -+ *is_corrupt = true; - } else { -- /* -- * A node having been created and later deleted -- * in this transaction will have no generation -- * information stored. -- */ -- ret = (i->generation == NO_GENERATION) -- ? 0 : do_tdb_delete(conn, &key, NULL); -- } -- if (ret) -- goto err; -- if (i->fire_watch) { -- fire_watches(conn, trans, i->node, NULL, -- i->watch_exact, -- i->perms.p ? &i->perms : NULL); -+ *is_corrupt = true; - } -+ } else { -+ /* -+ * A node having been created and later deleted -+ * in this transaction will have no generation -+ * information stored. -+ */ -+ *is_corrupt |= (i->generation == NO_GENERATION) -+ ? false -+ : do_tdb_delete(conn, &key, NULL); - } -+ if (i->fire_watch) -+ fire_watches(conn, trans, i->node, NULL, i->watch_exact, -+ i->perms.p ? 
&i->perms : NULL); - -- if (i->ta_node && do_tdb_delete(conn, &ta_key, NULL)) -- goto err; - list_del(&i->list); - talloc_free(i); - } - - return 0; -- --err: -- corrupt(conn, "Partial transaction"); -- return EIO; - } - - static int destroy_transaction(void *_transaction) - { - struct transaction *trans = _transaction; - struct accessed_node *i; -- char *trans_name; - TDB_DATA key; - - wrl_ntransactions--; - trace_destroy(trans, "transaction"); - while ((i = list_top(&trans->accessed, struct accessed_node, list))) { - if (i->ta_node) { -- trans_name = transaction_get_node_name(i, trans, -- i->node); -- if (trans_name) { -- set_tdb_key(trans_name, &key); -- do_tdb_delete(trans->conn, &key, NULL); -- } -+ set_tdb_key(i->trans_name, &key); -+ do_tdb_delete(trans->conn, &key, NULL); - } - list_del(&i->list); - talloc_free(i); -@@ -556,6 +537,7 @@ int do_transaction_end(const void *ctx, struct connection *conn, - { - const char *arg = onearg(in); - struct transaction *trans; -+ bool is_corrupt = false; - int ret; - - if (!arg || (!streq(arg, "T") && !streq(arg, "F"))) -@@ -579,13 +561,17 @@ int do_transaction_end(const void *ctx, struct connection *conn, - ret = transaction_fix_domains(trans, false); - if (ret) - return ret; -- if (finalize_transaction(conn, trans)) -- return EAGAIN; -+ ret = finalize_transaction(conn, trans, &is_corrupt); -+ if (ret) -+ return ret; - - wrl_apply_debit_trans_commit(conn); - - /* fix domain entry for each changed domain */ - transaction_fix_domains(trans, true); -+ -+ if (is_corrupt) -+ corrupt(conn, "transaction inconsistency"); - } - send_ack(conn, XS_TRANSACTION_END); - -@@ -660,7 +646,7 @@ int check_transactions(struct hashtable *hash) - struct connection *conn; - struct transaction *trans; - struct accessed_node *i; -- char *tname, *tnode; -+ char *tname; - - list_for_each_entry(conn, &connections, list) { - list_for_each_entry(trans, &conn->transaction_list, list) { -@@ -672,11 +658,8 @@ int check_transactions(struct hashtable 
*hash) - list_for_each_entry(i, &trans->accessed, list) { - if (!i->ta_node) - continue; -- tnode = transaction_get_node_name(tname, trans, -- i->node); -- if (!tnode || !remember_string(hash, tnode)) -+ if (!remember_string(hash, i->trans_name)) - goto nomem; -- talloc_free(tnode); - } - - talloc_free(tname); -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 39d7f81c5127..3417303f9427 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -48,8 +48,8 @@ int __must_check access_node(struct connection *conn, struct node *node, - void queue_watches(struct connection *conn, const char *name, bool watch_exact); - - /* Prepend the transaction to name if appropriate. */ --int transaction_prepend(struct connection *conn, const char *name, -- TDB_DATA *key); -+void transaction_prepend(struct connection *conn, const char *name, -+ TDB_DATA *key); - - /* Mark the transaction as failed. This will prevent it to be committed. */ - void fail_transaction(struct transaction *trans); --- -2.37.4 - diff --git a/0086-x86-spec-ctrl-Enumeration-for-IBPB_RET.patch b/0086-x86-spec-ctrl-Enumeration-for-IBPB_RET.patch deleted file mode 100644 index c15c285..0000000 --- a/0086-x86-spec-ctrl-Enumeration-for-IBPB_RET.patch +++ /dev/null @@ -1,82 +0,0 @@ -From b1a1df345aaf359f305d6d041e571929c9252645 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 14 Jun 2022 16:18:36 +0100 -Subject: [PATCH 86/87] x86/spec-ctrl: Enumeration for IBPB_RET - -The IBPB_RET bit indicates that the CPU's implementation of MSR_PRED_CMD.IBPB -does flush the RSB/RAS too. - -This is part of XSA-422 / CVE-2022-23824. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 24496558e650535bdbd22cc04731e82276cd1b3f) ---- - tools/libs/light/libxl_cpuid.c | 1 + - tools/misc/xen-cpuid.c | 1 + - xen/arch/x86/spec_ctrl.c | 5 +++-- - xen/include/public/arch-x86/cpufeatureset.h | 1 + - 4 files changed, 6 insertions(+), 2 deletions(-) - -diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c -index bf6fdee360a9..691d5c6b2a68 100644 ---- a/tools/libs/light/libxl_cpuid.c -+++ b/tools/libs/light/libxl_cpuid.c -@@ -289,6 +289,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) - {"ssb-no", 0x80000008, NA, CPUID_REG_EBX, 26, 1}, - {"psfd", 0x80000008, NA, CPUID_REG_EBX, 28, 1}, - {"btc-no", 0x80000008, NA, CPUID_REG_EBX, 29, 1}, -+ {"ibpb-ret", 0x80000008, NA, CPUID_REG_EBX, 30, 1}, - - {"nc", 0x80000008, NA, CPUID_REG_ECX, 0, 8}, - {"apicidsize", 0x80000008, NA, CPUID_REG_ECX, 12, 4}, -diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c -index fe22f5f5b68b..cd094427dd4c 100644 ---- a/tools/misc/xen-cpuid.c -+++ b/tools/misc/xen-cpuid.c -@@ -159,6 +159,7 @@ static const char *const str_e8b[32] = - [24] = "amd-ssbd", [25] = "virt-ssbd", - [26] = "ssb-no", - [28] = "psfd", [29] = "btc-no", -+ [30] = "ibpb-ret", - }; - - static const char *const str_7d0[32] = -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 0f4bad3d3abb..16a562d3a172 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -419,7 +419,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_IBRS_ALL) ? 
" IBRS_ALL" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -436,7 +436,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "", -- (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : ""); -+ (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : "", -+ (e8b & cpufeat_mask(X86_FEATURE_IBPB_RET)) ? " IBPB_RET" : ""); - - /* Hardware features which need driving to mitigate issues. */ - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index e7b8167800a2..e0731221404c 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -267,6 +267,7 @@ XEN_CPUFEATURE(VIRT_SSBD, 8*32+25) /* MSR_VIRT_SPEC_CTRL.SSBD */ - XEN_CPUFEATURE(SSB_NO, 8*32+26) /*A Hardware not vulnerable to SSB */ - XEN_CPUFEATURE(PSFD, 8*32+28) /*S MSR_SPEC_CTRL.PSFD */ - XEN_CPUFEATURE(BTC_NO, 8*32+29) /*A Hardware not vulnerable to Branch Type Confusion */ -+XEN_CPUFEATURE(IBPB_RET, 8*32+30) /*A IBPB clears RSB/RAS too. 
*/ - - /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */ - XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */ --- -2.37.4 - diff --git a/0087-x86-spec-ctrl-Mitigate-IBPB-not-flushing-the-RSB-RAS.patch b/0087-x86-spec-ctrl-Mitigate-IBPB-not-flushing-the-RSB-RAS.patch deleted file mode 100644 index 9bcb4d3..0000000 --- a/0087-x86-spec-ctrl-Mitigate-IBPB-not-flushing-the-RSB-RAS.patch +++ /dev/null @@ -1,113 +0,0 @@ -From c1e196ab490b47ce42037c2fef8184a19d96922b Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 14 Jun 2022 16:18:36 +0100 -Subject: [PATCH 87/87] x86/spec-ctrl: Mitigate IBPB not flushing the RSB/RAS - -Introduce spec_ctrl_new_guest_context() to encapsulate all logic pertaining to -using MSR_PRED_CMD for a new guest context, even if it only has one user -presently. - -Introduce X86_BUG_IBPB_NO_RET, and use it extend spec_ctrl_new_guest_context() -with a manual fixup for hardware which mis-implements IBPB. - -This is part of XSA-422 / CVE-2022-23824. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 2b27967fb89d7904a1571a2fb963b1c9cac548db) ---- - xen/arch/x86/asm-macros.c | 1 + - xen/arch/x86/domain.c | 2 +- - xen/arch/x86/spec_ctrl.c | 8 ++++++++ - xen/include/asm-x86/cpufeatures.h | 1 + - xen/include/asm-x86/spec_ctrl.h | 22 ++++++++++++++++++++++ - 5 files changed, 33 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/asm-macros.c b/xen/arch/x86/asm-macros.c -index 7e536b0d82f5..891d86c7655c 100644 ---- a/xen/arch/x86/asm-macros.c -+++ b/xen/arch/x86/asm-macros.c -@@ -1,2 +1,3 @@ - #include <asm/asm-defns.h> - #include <asm/alternative-asm.h> -+#include <asm/spec_ctrl_asm.h> -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 3fab2364be8d..3080cde62b5b 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -2092,7 +2092,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next) - */ - if ( *last_id != next_id ) - { -- wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); -+ spec_ctrl_new_guest_context(); - *last_id = next_id; - } - } -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 16a562d3a172..90d86fe5cb47 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -804,6 +804,14 @@ static void __init ibpb_calculations(void) - return; - } - -+ /* -+ * AMD/Hygon CPUs to date (June 2022) don't flush the the RAS. Future -+ * CPUs are expected to enumerate IBPB_RET when this has been fixed. -+ * Until then, cover the difference with the software sequence. -+ */ -+ if ( boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_IBPB_RET) ) -+ setup_force_cpu_cap(X86_BUG_IBPB_NO_RET); -+ - /* - * IBPB-on-entry mitigations for Branch Type Confusion. 
- * -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index 672c9ee22ba2..ecc1bb09505a 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -49,6 +49,7 @@ XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for - #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ - #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */ - #define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ -+#define X86_BUG_IBPB_NO_RET X86_BUG( 3) /* IBPB doesn't flush the RSB/RAS */ - - /* Total number of capability words, inc synth and bug words. */ - #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */ -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 9403b81dc7af..6a77c3937844 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -65,6 +65,28 @@ - void init_speculation_mitigations(void); - void spec_ctrl_init_domain(struct domain *d); - -+/* -+ * Switch to a new guest prediction context. -+ * -+ * This flushes all indirect branch predictors (BTB, RSB/RAS), so guest code -+ * which has previously run on this CPU can't attack subsequent guest code. -+ * -+ * As this flushes the RSB/RAS, it destroys the predictions of the calling -+ * context. For best performace, arrange for this to be used when we're going -+ * to jump out of the current context, e.g. with reset_stack_and_jump(). -+ * -+ * For hardware which mis-implements IBPB, fix up by flushing the RSB/RAS -+ * manually. -+ */ -+static always_inline void spec_ctrl_new_guest_context(void) -+{ -+ wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); -+ -+ /* (ab)use alternative_input() to specify clobbers. 
*/ -+ alternative_input("", "DO_OVERWRITE_RSB", X86_BUG_IBPB_NO_RET, -+ : "rax", "rcx"); -+} -+ - extern int8_t opt_ibpb_ctxt_switch; - extern bool opt_ssbd; - extern int8_t opt_eager_fpu; --- -2.37.4 - @@ -1,6 +1,6 @@ -Xen upstream patchset #1 for 4.16.3-pre +Xen upstream patchset #0 for 4.16.4-pre Containing patches from -RELEASE-4.16.2 (1871bd1c9eb934f0ffd039f3d68e42fd0097f322) +RELEASE-4.16.3 (08c42cec2f3dbb8d1df62c2ad4945d127b418fd6) to -staging-4.16 (c1e196ab490b47ce42037c2fef8184a19d96922b) +staging-4.16 (4ad5975d4e35635f03d2cb9e86292c0daeabd75f) |