kata-containers/patches/0003-configuration-add-configuration-for-StratoVirt-hyper.patch
2025-05-18 17:30:05 +00:00

549 lines
22 KiB
Diff

From 2d0431ff3051b9267919a543906c8fd5547facf9 Mon Sep 17 00:00:00 2001
From: Liu Wenyuan <liuwenyuan9@huawei.com>
Date: Wed, 23 Aug 2023 17:49:56 +0800
Subject: [PATCH 3/5] configuration: add configuration for StratoVirt
hypervisor.
Add configuration-stratovirt.toml.in to generate the StratoVirt configuration,
and parser to deliver config to StratoVirt.
Signed-off-by: Liu Wenyuan <liuwenyuan9@huawei.com>
---
.../config/configuration-stratovirt.toml.in | 394 ++++++++++++++++++
src/runtime/pkg/katautils/config.go | 104 +++++
2 files changed, 498 insertions(+)
create mode 100644 src/runtime/config/configuration-stratovirt.toml.in
diff --git a/src/runtime/config/configuration-stratovirt.toml.in b/src/runtime/config/configuration-stratovirt.toml.in
new file mode 100644
index 0000000..c98cf6b
--- /dev/null
+++ b/src/runtime/config/configuration-stratovirt.toml.in
@@ -0,0 +1,394 @@
+# Copyright (c) 2023 Huawei Technologies Co.,Ltd.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# XXX: WARNING: this file is auto-generated.
+# XXX:
+# XXX: Source file: "@CONFIG_STRATOVIRT_IN@"
+# XXX: Project:
+# XXX: Name: @PROJECT_NAME@
+# XXX: Type: @PROJECT_TYPE@
+
+[hypervisor.stratovirt]
+path = "@STRATOVIRTPATH@"
+kernel = "@KERNELPATH_STRATOVIRT@"
+#image = "@IMAGEPATH@"
+initrd = "@INITRDPATH@"
+machine_type = "@DEFMACHINETYPE_STRATOVIRT@"
+
+# rootfs filesystem type:
+# - ext4 (default)
+# - xfs
+# - erofs
+rootfs_type = @DEFROOTFSTYPE@
+
+# List of valid annotation names for the hypervisor
+# Each member of the list is a regular expression, which is the base name
+# of the annotation, e.g. "path" for "io.katacontainers.config.hypervisor.path"
+enable_annotations = @DEFENABLEANNOTATIONS@
+
+# List of valid annotations values for the hypervisor
+# Each member of the list is a path pattern as described by glob(3).
+# The default if not set is empty (all annotations rejected.)
+# Your distribution recommends: @STRATOVIRTVALIDHYPERVISORPATHS@
+valid_hypervisor_paths = @STRATOVIRTVALIDHYPERVISORPATHS@
+
+# Optional space-separated list of options to pass to the guest kernel.
+# For example, use `kernel_params = "vsyscall=emulate"` if you are having
+# trouble running pre-2.15 glibc.
+#
+# WARNING: - any parameter specified here will take priority over the default
+# parameter value of the same name used to start the virtual machine.
+# Do not set values here unless you understand the impact of doing so as you
+# may stop the virtual machine from booting.
+# To see the list of default parameters, enable hypervisor debug, create a
+# container and look for 'default-kernel-parameters' log entries.
+kernel_params = "@KERNELPARAMS@"
+
+# Default number of vCPUs per SB/VM:
+# unspecified or 0 --> will be set to @DEFVCPUS@
+# < 0 --> will be set to the actual number of physical cores
+# > 0 <= number of physical cores --> will be set to the specified number
+# > number of physical cores --> will be set to the actual number of physical cores
+default_vcpus = 1
+
+# Default maximum number of vCPUs per SB/VM:
+# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number
+# of vCPUs supported by KVM if that number is exceeded
+# > 0 <= number of physical cores --> will be set to the specified number
+# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number
+# of vCPUs supported by KVM if that number is exceeded
+# WARNING: Depending on the architecture, the maximum number of vCPUs supported by KVM is used when
+# the actual number of physical cores is greater than it.
+# WARNING: Be aware that this value impacts the virtual machine's memory footprint and
+# the CPU hotplug functionality. For example, `default_maxvcpus = 240` specifies that up to 240 vCPUs
+# can be added to a SB/VM, but the memory footprint will be big. Another example, with
+# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of
+# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable,
+# unless you know what you are doing.
+# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8.
+default_maxvcpus = @DEFMAXVCPUS@
+
+# Bridges can be used to hot plug devices.
+# Limitations:
+# * Currently only pci bridges are supported
+# * Up to 30 devices per bridge can be hot plugged.
+# * Up to 5 PCI bridges can be cold plugged per VM.
+# This limitation could be a bug in the kernel
+# Default number of bridges per SB/VM:
+# unspecified or 0 --> will be set to @DEFBRIDGES@
+# > 1 <= 5 --> will be set to the specified number
+# > 5 --> will be set to 5
+default_bridges = @DEFBRIDGES@
+
+# Default memory size in MiB for SB/VM.
+# If unspecified then it will be set to @DEFMEMSZ@ MiB.
+default_memory = @DEFMEMSZ@
+#
+# Default memory slots per SB/VM.
+# If unspecified then it will be set to @DEFMEMSLOTS@.
+# This determines how many times memory can be hot-added to the sandbox/VM.
+#memory_slots = @DEFMEMSLOTS@
+
+# Default maximum memory in MiB per SB / VM
+# unspecified or == 0 --> will be set to the actual amount of physical RAM
+# > 0 <= amount of physical RAM --> will be set to the specified number
+# > amount of physical RAM --> will be set to the actual amount of physical RAM
+default_maxmemory = @DEFMAXMEMSZ@
+
+# The size in MiB that will be added to the max memory of the hypervisor.
+# It is the memory address space for the NVDIMM device.
+# If the block storage driver (block_device_driver) is set to "nvdimm",
+# memory_offset should be set to the size of the block device.
+# Default 0
+#memory_offset = 0
+
+# Disable block device from being used for a container's rootfs.
+# In case of a storage driver like devicemapper where a container's
+# root file system is backed by a block device, the block device is passed
+# directly to the hypervisor for performance reasons.
+# This flag prevents the block device from being passed to the hypervisor,
+# virtio-fs is used instead to pass the rootfs.
+disable_block_device_use = @DEFDISABLEBLOCK@
+
+# Shared file system type:
+# - virtio-fs (default)
+# - virtio-fs-nydus
+# - none
+shared_fs = "@DEFSHAREDFS_STRATOVIRT_VIRTIOFS@"
+
+# Path to vhost-user-fs daemon.
+virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@"
+
+# List of valid annotations values for the virtiofs daemon
+# The default if not set is empty (all annotations rejected.)
+valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@
+
+# Default size of DAX cache in MiB
+virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@
+
+# Extra args for virtiofsd daemon
+#
+# Format example:
+# ["--arg1=xxx", "--arg2=yyy"]
+# Examples:
+# Set virtiofsd log level to debug : ["--log-level=debug"]
+#
+# see `virtiofsd -h` for possible options.
+virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@
+
+# Cache mode:
+#
+# - never
+# Metadata, data, and pathname lookup are not cached in guest. They are
+# always fetched from host and any changes are immediately pushed to host.
+#
+# - auto
+# Metadata and pathname lookup cache expires after a configured amount of
+# time (default is 1 second). Data is cached while the file is open (close
+# to open consistency).
+#
+# - always
+# Metadata, data, and pathname lookup are cached in guest and never expire.
+virtio_fs_cache = "@DEFVIRTIOFSCACHE@"
+
+# Block storage driver to be used for the hypervisor in case the container
+# rootfs is backed by a block device. This is virtio-scsi, virtio-blk
+# or nvdimm.
+block_device_driver = "@DEFBLOCKSTORAGEDRIVER_STRATOVIRT@"
+
+# Specifies whether cache-related options will be set for block devices.
+# Default false
+#block_device_cache_set = true
+
+# Specifies cache-related options for block devices.
+# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
+# Default false
+#block_device_cache_direct = true
+
+# Specifies cache-related options for block devices.
+# Denotes whether flush requests for the device are ignored.
+# Default false
+#block_device_cache_noflush = true
+
+# Enable huge pages for VM RAM, default false
+# Enabling this will result in the VM memory
+# being allocated using huge pages.
+# This is useful when you want to use vhost-user network
+# stacks within the container. This will automatically
+# result in memory pre allocation
+#enable_hugepages = true
+
+# Enable vIOMMU, default false
+# Enabling this will result in the VM having a vIOMMU device
+# This will also add the following options to the kernel's
+# command line: intel_iommu=on,iommu=pt
+#enable_iommu = true
+
+# This option changes the default hypervisor and kernel parameters
+# to enable debug output where available.
+#
+# Default false
+#enable_debug = true
+
+# Disable the customizations done in the runtime when it detects
+# that it is running on top of a VMM. This will result in the runtime
+# behaving as it would when running on bare metal.
+#
+#disable_nesting_checks = true
+
+#
+# Default entropy source.
+# The path to a host source of entropy (including a real hardware RNG)
+# /dev/urandom and /dev/random are two main options.
+# Be aware that /dev/random is a blocking source of entropy. If the host
+# runs out of entropy, the VM's boot time will increase, leading to startup
+# timeouts.
+# The source of entropy /dev/urandom is non-blocking and provides a
+# generally acceptable source of entropy. It should work well for pretty much
+# all practical purposes.
+entropy_source = "@DEFENTROPYSOURCE@"
+
+# Path to OCI hook binaries in the *guest rootfs*.
+# This does not affect host-side hooks which must instead be added to
+# the OCI spec passed to the runtime.
+#
+# You can create a rootfs with hooks by customizing the osbuilder scripts:
+# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder
+#
+# Hooks must be stored in a subdirectory of guest_hook_path according to their
+# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}".
+# The agent will scan these directories for executable files and add them, in
+# lexicographical order, to the lifecycle of the guest container.
+# Hooks are executed in the runtime namespace of the guest. See the official documentation:
+# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks
+# Warnings will be logged if any error is encountered while scanning for hooks,
+# but it will not abort container execution.
+#guest_hook_path = "/usr/share/oci/hooks"
+
+# disable applying SELinux on the VMM process (default false)
+disable_selinux = @DEFDISABLESELINUX@
+
+# disable applying SELinux on the container process
+# If set to false, the type `container_t` is applied to the container process by default.
+# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
+# with `SELINUX=yes`.
+# (default: true)
+disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
+
+[factory]
+# VM templating support. Once enabled, new VMs are created from template
+# using vm cloning. They will share the same initial kernel, initramfs and
+# agent memory by mapping it readonly. It helps speeding up new container
+# creation and saves a lot of memory if there are many kata containers running
+# on the same host.
+#
+# When disabled, new VMs are created from scratch.
+#
+# Note: Requires "initrd=" to be set ("image=" is not supported).
+#
+# Default false
+#enable_template = true
+
+[agent.@PROJECT_TYPE@]
+# If enabled, make the agent display debug-level messages.
+# (default: disabled)
+#enable_debug = true
+
+# Enable agent tracing.
+#
+# If enabled, the agent will generate OpenTelemetry trace spans.
+#
+# Notes:
+#
+# - If the runtime also has tracing enabled, the agent spans will be
+# associated with the appropriate runtime parent span.
+# - If enabled, the runtime will wait for the container to shutdown,
+# increasing the container shutdown time slightly.
+#
+# (default: disabled)
+#enable_tracing = true
+
+# Comma separated list of kernel modules and their parameters.
+# These modules will be loaded in the guest kernel using modprobe(8).
+# The following example can be used to load two kernel modules with parameters
+# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"]
+# The first word is considered as the module name and the rest as its parameters.
+# Container will not be started when:
+# * A kernel module is specified and the modprobe command is not installed in the guest
+# or it fails loading the module.
+# * The module is not available in the guest or it doesn't meet the guest kernel
+# requirements, like architecture and version.
+#
+kernel_modules = []
+
+# Enable debug console.
+
+# If enabled, user can connect guest OS running inside hypervisor
+# through "kata-runtime exec <sandbox-id>" command
+
+#debug_console_enabled = true
+
+# Agent connection dialing timeout value in seconds
+# (default: 45)
+dial_timeout = 45
+
+[runtime]
+# If enabled, the runtime will log additional debug messages to the
+# system log
+# (default: disabled)
+#enable_debug = true
+#
+# Internetworking model
+# Determines how the VM should be connected to
+# the container network interface
+# Options:
+#
+# - macvtap
+# Used when the Container network interface can be bridged using
+# macvtap.
+#
+# - none
+# Used when customizing the network. Only creates a tap device. No veth pair.
+#
+# - tcfilter
+# Uses tc filter rules to redirect traffic from the network interface
+# provided by plugin to a tap interface connected to the VM.
+#
+internetworking_model = "@DEFNETWORKMODEL_STRATOVIRT@"
+
+# disable guest seccomp
+# Determines whether container seccomp profiles are passed to the virtual
+# machine and applied by the kata agent. If set to true, seccomp is not applied
+# within the guest
+# (default: true)
+disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
+
+# vCPUs pinning settings
+# if enabled, each vCPU thread will be scheduled to a fixed CPU
+# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
+#enable_vcpus_pinning = false
+
+# Apply a custom SELinux security policy to the container process inside the VM.
+# This is used when you want to apply a type other than the default `container_t`,
+# so general users should not uncomment and apply it.
+# (format: "user:role:type")
+# Note: You cannot specify MCS policy with the label because the sensitivity levels and
+# categories are determined automatically by high-level container runtimes such as containerd.
+#guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
+
+# If enabled, the runtime will create opentracing.io traces and spans.
+# (See https://www.jaegertracing.io/docs/getting-started).
+# (default: disabled)
+#enable_tracing = true
+
+# Set the full url to the Jaeger HTTP Thrift collector.
+# The default if not set will be "http://localhost:14268/api/traces"
+#jaeger_endpoint = ""
+
+# Sets the username to be used if basic auth is required for Jaeger.
+#jaeger_user = ""
+
+# Sets the password to be used if basic auth is required for Jaeger.
+#jaeger_password = ""
+
+# If enabled, the runtime will not create a network namespace for shim and hypervisor processes.
+# This option may have some potential impacts to your host. It should only be used when you know what you're doing.
+# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only
+# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge
+# (like OVS) directly.
+# (default: false)
+#disable_new_netns = true
+
+# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
+# The container cgroups in the host are not created, just one single cgroup per sandbox.
+# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
+# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
+# The sandbox cgroup is constrained if there is no container type annotation.
+# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
+sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY@
+
+# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
+# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
+# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug.
+# Compatibility for determining appropriate sandbox (VM) size:
+# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
+# does not yet support sandbox sizing annotations.
+# - When running single containers using a tool like ctr, container sizing information will be available.
+static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_STRATOVIRT@
+
+# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will
+# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest.
+disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
+
+# Enabled experimental feature list, format: ["a", "b"].
+# Experimental features are features not stable enough for production,
+# they may break compatibility, and are prepared for a big version bump.
+# Supported experimental features:
+# (default: [])
+experimental = @DEFAULTEXPFEATURES@
+
+# If enabled, user can run pprof tools with shim v2 process through kata-monitor.
+# (default: false)
+#enable_pprof = true
diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index feeef68..45bbfea 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -52,6 +52,7 @@ const (
qemuHypervisorTableType = "qemu"
acrnHypervisorTableType = "acrn"
dragonballHypervisorTableType = "dragonball"
+ stratovirtHypervisorTableType = "stratovirt"
// the maximum amount of PCI bridges that can be cold plugged in a VM
maxPCIBridges uint32 = 5
@@ -1142,6 +1143,106 @@ func newDragonballHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
}, nil
}
+// newStratovirtHypervisorConfig builds the vc.HypervisorConfig for a
+// "stratovirt" hypervisor table of the configuration file.
+//
+// It returns an error, together with a zero-value vc.HypervisorConfig, when
+// any of the h accessor calls fails, when image and initrd are both set or
+// both empty, when the shared filesystem is not one of config.VirtioFS,
+// config.VirtioFSNydus or config.NoSharedFS, or when a virtio-fs variant is
+// selected without a virtiofs daemon path.
+func newStratovirtHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
+ hypervisor, err := h.path()
+ if err != nil {
+ return vc.HypervisorConfig{}, err
+ }
+
+ kernel, err := h.kernel()
+ if err != nil {
+ return vc.HypervisorConfig{}, err
+ }
+
+ initrd, err := h.initrd()
+ if err != nil {
+ return vc.HypervisorConfig{}, err
+ }
+
+ image, err := h.image()
+ if err != nil {
+ return vc.HypervisorConfig{}, err
+ }
+
+ // Exactly one of image and initrd must be configured: both set and both
+ // empty are rejected.
+ if image != "" && initrd != "" {
+ return vc.HypervisorConfig{},
+ errors.New("having both an image and an initrd defined in the configuration file is not supported")
+ }
+
+ if image == "" && initrd == "" {
+ return vc.HypervisorConfig{},
+ errors.New("image or initrd must be defined in the configuration file")
+ }
+
+ rootfsType, err := h.rootfsType()
+ if err != nil {
+ return vc.HypervisorConfig{}, err
+ }
+
+ kernelParams := h.kernelParams()
+ machineType := h.machineType()
+
+ blockDriver, err := h.blockDeviceDriver()
+ if err != nil {
+ return vc.HypervisorConfig{}, err
+ }
+
+ // NOTE(review): this err is scoped to the if statement and shadows the
+ // outer err. The branch returns whatever error SupportsVsocks reported, so
+ // it presumably always pairs false with a non-nil error; otherwise the
+ // caller would get an empty config and a nil error. Confirm against
+ // utils.SupportsVsocks.
+ if vSock, err := utils.SupportsVsocks(); !vSock {
+ return vc.HypervisorConfig{}, err
+ }
+
+ sharedFS, err := h.sharedFS()
+ if err != nil {
+ return vc.HypervisorConfig{}, err
+ }
+
+ // Only config.VirtioFS, config.VirtioFSNydus and config.NoSharedFS are
+ // accepted as shared filesystem types.
+ if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus && sharedFS != config.NoSharedFS {
+ return vc.HypervisorConfig{},
+ fmt.Errorf("Stratovirt Hypervisor does not support %s shared filesystem option", sharedFS)
+ }
+
+ // Both virtio-fs variants require a daemon path (h.VirtioFSDaemon).
+ if (sharedFS == config.VirtioFS || sharedFS == config.VirtioFSNydus) && h.VirtioFSDaemon == "" {
+ return vc.HypervisorConfig{},
+ fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS)
+ }
+
+ return vc.HypervisorConfig{
+ HypervisorPath: hypervisor,
+ HypervisorPathList: h.HypervisorPathList,
+ KernelPath: kernel,
+ InitrdPath: initrd,
+ ImagePath: image,
+ RootfsType: rootfsType,
+ // The kernel parameter string is split on whitespace before being
+ // deserialized.
+ KernelParams: vc.DeserializeParams(strings.Fields(kernelParams)),
+ HypervisorMachineType: machineType,
+ NumVCPUs: h.defaultVCPUs(),
+ DefaultMaxVCPUs: h.defaultMaxVCPUs(),
+ MemorySize: h.defaultMemSz(),
+ MemSlots: h.defaultMemSlots(),
+ MemOffset: h.defaultMemOffset(),
+ DefaultMaxMemorySize: h.defaultMaxMemSz(),
+ EntropySource: h.GetEntropySource(),
+ DefaultBridges: h.defaultBridges(),
+ DisableBlockDeviceUse: h.DisableBlockDeviceUse,
+ SharedFS: sharedFS,
+ VirtioFSDaemon: h.VirtioFSDaemon,
+ VirtioFSDaemonList: h.VirtioFSDaemonList,
+ VirtioFSCacheSize: h.VirtioFSCacheSize,
+ VirtioFSCache: h.defaultVirtioFSCache(),
+ VirtioFSExtraArgs: h.VirtioFSExtraArgs,
+ HugePages: h.HugePages,
+ Debug: h.Debug,
+ DisableNestingChecks: h.DisableNestingChecks,
+ BlockDeviceDriver: blockDriver,
+ // DisableVhostNet is hard-coded to true rather than read from h.
+ DisableVhostNet: true,
+ GuestHookPath: h.guestHookPath(),
+ EnableAnnotations: h.EnableAnnotations,
+ DisableSeccomp: h.DisableSeccomp,
+ DisableSeLinux: h.DisableSeLinux,
+ DisableGuestSeLinux: h.DisableGuestSeLinux,
+ }, nil
+}
+
func newFactoryConfig(f factory) (oci.FactoryConfig, error) {
if f.TemplatePath == "" {
f.TemplatePath = defaultTemplatePath
@@ -1178,6 +1279,9 @@ func updateRuntimeConfigHypervisor(configPath string, tomlConf tomlConfig, confi
case dragonballHypervisorTableType:
config.HypervisorType = vc.DragonballHypervisor
hConfig, err = newDragonballHypervisorConfig(hypervisor)
+ case stratovirtHypervisorTableType:
+ config.HypervisorType = vc.StratovirtHypervisor
+ hConfig, err = newStratovirtHypervisorConfig(hypervisor)
default:
err = fmt.Errorf("%s: %+q", errInvalidHypervisorPrefix, k)
}
--
2.34.1