contrib: Introduce PoC for Kata Containers with user-mode networking
passt can be used to implement user-mode networking for the Kata Containers runtime, so that networking setup doesn't need elevated privileges or capabilities. This commit adds the patch for Kata Containers runtime and agent to support passt as networking model and endpoint, and some basic documentation. See contrib/kata-containers/README.md for more details and setup steps. Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
This commit is contained in:
parent
27050b094f
commit
20d271b226
2 changed files with 764 additions and 0 deletions
|
@ -0,0 +1,462 @@
|
||||||
|
From e1b250fc0b5e377285db5d90476fdd2d63501191 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Stefano Brivio <sbrivio@redhat.com>
|
||||||
|
Date: Fri, 28 Jan 2022 01:09:23 +0100
|
||||||
|
Subject: [PATCH] virtcontainers, agent: Add passt networking model and
|
||||||
|
endpoint
|
||||||
|
|
||||||
|
This implements draft support for user-mode networking using
|
||||||
|
passt (https://passt.top), the corresponding networking model
|
||||||
|
can be enabled via:
|
||||||
|
|
||||||
|
internetworking_model="passt"
|
||||||
|
|
||||||
|
in the [runtime] section of the TOML configuration file.
|
||||||
|
|
||||||
|
The networking endpoint does essentially nothing, other than
|
||||||
|
starting and stopping passt as needed: no interfaces are configured,
|
||||||
|
qemu connects to passt via UNIX domain socket, the corresponding
|
||||||
|
command line option is appended if this networking model is
|
||||||
|
selected.
|
||||||
|
|
||||||
|
The passt instance started by the endpoint takes care of forwarding
|
||||||
|
traffic back and forth, translating between the L2 frames qemu-side
|
||||||
|
and native L4 sockets on the host.
|
||||||
|
|
||||||
|
This network setup doesn't need elevated privileges or any kind of
|
||||||
|
capability. However, this patch doesn't implement privileges drop
|
||||||
|
as the containerd interface allows only runtimes running as the
|
||||||
|
same user to connect to its own UNIX domain socket interface,
|
||||||
|
typically root (at least in the case of CRI-O), and root privileges
|
||||||
|
might anyway be needed for other purposes (block devices, etc.)
|
||||||
|
|
||||||
|
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
|
||||||
|
---
|
||||||
|
SPDX-FileCopyrightText: 2021-2022 Red Hat GmbH <sbrivio@redhat.com>
|
||||||
|
SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
src/agent/src/netlink.rs | 3 +-
|
||||||
|
.../kata-containers/govmm/qemu/qemu.go | 23 ++-
|
||||||
|
src/runtime/virtcontainers/endpoint.go | 7 +
|
||||||
|
src/runtime/virtcontainers/network.go | 24 +++
|
||||||
|
src/runtime/virtcontainers/passt_endpoint.go | 156 ++++++++++++++++++
|
||||||
|
.../virtcontainers/persist/api/network.go | 5 +
|
||||||
|
src/runtime/virtcontainers/qemu_arch_base.go | 11 ++
|
||||||
|
7 files changed, 226 insertions(+), 3 deletions(-)
|
||||||
|
create mode 100644 src/runtime/virtcontainers/passt_endpoint.go
|
||||||
|
|
||||||
|
diff --git a/src/agent/src/netlink.rs b/src/agent/src/netlink.rs
|
||||||
|
index ed071b60..34c6df96 100644
|
||||||
|
--- a/src/agent/src/netlink.rs
|
||||||
|
+++ b/src/agent/src/netlink.rs
|
||||||
|
@@ -312,7 +312,8 @@ impl Handle {
|
||||||
|
let list = a.iter().chain(&b);
|
||||||
|
|
||||||
|
for route in list {
|
||||||
|
- let link = self.find_link(LinkFilter::Name(&route.device)).await?;
|
||||||
|
+ // TODO: "eth0" hardcoded for passt networking model
|
||||||
|
+ let link = self.find_link(LinkFilter::Name("eth0")).await?;
|
||||||
|
|
||||||
|
const MAIN_TABLE: u8 = packet::constants::RT_TABLE_MAIN;
|
||||||
|
const UNICAST: u8 = packet::constants::RTN_UNICAST;
|
||||||
|
diff --git a/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go b/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go
|
||||||
|
index e57a4b26..1756bdfd 100644
|
||||||
|
--- a/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go
|
||||||
|
+++ b/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go
|
||||||
|
@@ -682,6 +682,8 @@ const (
|
||||||
|
|
||||||
|
// VHOSTUSER is a vhost-user port (socket)
|
||||||
|
VHOSTUSER NetDeviceType = "vhostuser"
|
||||||
|
+
|
||||||
|
+ PASST NetDeviceType = "passt"
|
||||||
|
)
|
||||||
|
|
||||||
|
// QemuNetdevParam converts to the QEMU -netdev parameter notation
|
||||||
|
@@ -709,6 +711,8 @@ func (n NetDeviceType) QemuNetdevParam(netdev *NetDevice, config *Config) string
|
||||||
|
log.Fatal("vhost-user devices are not supported on IBM Z")
|
||||||
|
}
|
||||||
|
return "vhost-user" // -netdev type=vhost-user (no device)
|
||||||
|
+ case PASST:
|
||||||
|
+ return "socket" // -netdev type=socket,connect=...
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
@@ -742,6 +746,8 @@ func (n NetDeviceType) QemuDeviceParam(netdev *NetDevice, config *Config) Device
|
||||||
|
log.Fatal("vhost-user devices are not supported on IBM Z")
|
||||||
|
}
|
||||||
|
return "" // -netdev type=vhost-user (no device)
|
||||||
|
+ case PASST:
|
||||||
|
+ device = "virtio-net"
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
@@ -806,6 +812,8 @@ type NetDevice struct {
|
||||||
|
|
||||||
|
// Transport is the virtio transport for this device.
|
||||||
|
Transport VirtioTransport
|
||||||
|
+
|
||||||
|
+ SocketPath string
|
||||||
|
}
|
||||||
|
|
||||||
|
// VirtioNetTransport is a map of the virtio-net device name that corresponds
|
||||||
|
@@ -818,6 +826,10 @@ var VirtioNetTransport = map[VirtioTransport]string{
|
||||||
|
|
||||||
|
// Valid returns true if the NetDevice structure is valid and complete.
|
||||||
|
func (netdev NetDevice) Valid() bool {
|
||||||
|
+ if netdev.Type == PASST {
|
||||||
|
+ return true
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
if netdev.ID == "" || netdev.IFName == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
@@ -867,7 +879,9 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string {
|
||||||
|
|
||||||
|
deviceParams = append(deviceParams, fmt.Sprintf("driver=%s", driver))
|
||||||
|
deviceParams = append(deviceParams, fmt.Sprintf("netdev=%s", netdev.ID))
|
||||||
|
- deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", netdev.MACAddress))
|
||||||
|
+ if netdev.MACAddress != "" {
|
||||||
|
+ deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", netdev.MACAddress))
|
||||||
|
+ }
|
||||||
|
|
||||||
|
if netdev.Bus != "" {
|
||||||
|
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus))
|
||||||
|
@@ -937,7 +951,12 @@ func (netdev NetDevice) QemuNetdevParams(config *Config) []string {
|
||||||
|
netdevParams = append(netdevParams, fmt.Sprintf("fds=%s", strings.Join(fdParams, ":")))
|
||||||
|
|
||||||
|
} else {
|
||||||
|
- netdevParams = append(netdevParams, fmt.Sprintf("ifname=%s", netdev.IFName))
|
||||||
|
+ if netdev.IFName != "" {
|
||||||
|
+ netdevParams = append(netdevParams, fmt.Sprintf("ifname=%s", netdev.IFName))
|
||||||
|
+ }
|
||||||
|
+ if netdev.SocketPath != "" {
|
||||||
|
+ netdevParams = append(netdevParams, fmt.Sprintf("connect=%s", netdev.SocketPath))
|
||||||
|
+ }
|
||||||
|
if netdev.DownScript != "" {
|
||||||
|
netdevParams = append(netdevParams, fmt.Sprintf("downscript=%s", netdev.DownScript))
|
||||||
|
}
|
||||||
|
diff --git a/src/runtime/virtcontainers/endpoint.go b/src/runtime/virtcontainers/endpoint.go
|
||||||
|
index 7786bb3e..e167304a 100644
|
||||||
|
--- a/src/runtime/virtcontainers/endpoint.go
|
||||||
|
+++ b/src/runtime/virtcontainers/endpoint.go
|
||||||
|
@@ -65,6 +65,8 @@ const (
|
||||||
|
|
||||||
|
// IPVlanEndpointType is ipvlan network interface.
|
||||||
|
IPVlanEndpointType EndpointType = "ipvlan"
|
||||||
|
+
|
||||||
|
+ PasstEndpointType EndpointType = "passt"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Set sets an endpoint type based on the input string.
|
||||||
|
@@ -94,6 +96,9 @@ func (endpointType *EndpointType) Set(value string) error {
|
||||||
|
case "ipvlan":
|
||||||
|
*endpointType = IPVlanEndpointType
|
||||||
|
return nil
|
||||||
|
+ case "passt":
|
||||||
|
+ *endpointType = PasstEndpointType
|
||||||
|
+ return nil
|
||||||
|
default:
|
||||||
|
return fmt.Errorf("Unknown endpoint type %s", value)
|
||||||
|
}
|
||||||
|
@@ -118,6 +123,8 @@ func (endpointType *EndpointType) String() string {
|
||||||
|
return string(TuntapEndpointType)
|
||||||
|
case IPVlanEndpointType:
|
||||||
|
return string(IPVlanEndpointType)
|
||||||
|
+ case PasstEndpointType:
|
||||||
|
+ return string(PasstEndpointType)
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
diff --git a/src/runtime/virtcontainers/network.go b/src/runtime/virtcontainers/network.go
|
||||||
|
index e6c681da..2de692fe 100644
|
||||||
|
--- a/src/runtime/virtcontainers/network.go
|
||||||
|
+++ b/src/runtime/virtcontainers/network.go
|
||||||
|
@@ -57,6 +57,9 @@ const (
|
||||||
|
// NetXConnectNoneModel can be used when the VM is in the host network namespace
|
||||||
|
NetXConnectNoneModel
|
||||||
|
|
||||||
|
+ // passt in namespace connecting hypervisor via host sockets
|
||||||
|
+ NetXConnectPasstModel
|
||||||
|
+
|
||||||
|
// NetXConnectInvalidModel is the last item to Check valid values by IsValid()
|
||||||
|
NetXConnectInvalidModel
|
||||||
|
)
|
||||||
|
@@ -73,6 +76,8 @@ const (
|
||||||
|
|
||||||
|
tcFilterNetModelStr = "tcfilter"
|
||||||
|
|
||||||
|
+ passtNetModelStr = "passt"
|
||||||
|
+
|
||||||
|
noneNetModelStr = "none"
|
||||||
|
)
|
||||||
|
|
||||||
|
@@ -85,6 +90,8 @@ func (n *NetInterworkingModel) GetModel() string {
|
||||||
|
return macvtapNetModelStr
|
||||||
|
case NetXConnectTCFilterModel:
|
||||||
|
return tcFilterNetModelStr
|
||||||
|
+ case NetXConnectPasstModel:
|
||||||
|
+ return passtNetModelStr
|
||||||
|
case NetXConnectNoneModel:
|
||||||
|
return noneNetModelStr
|
||||||
|
}
|
||||||
|
@@ -103,6 +110,9 @@ func (n *NetInterworkingModel) SetModel(modelName string) error {
|
||||||
|
case tcFilterNetModelStr:
|
||||||
|
*n = NetXConnectTCFilterModel
|
||||||
|
return nil
|
||||||
|
+ case passtNetModelStr:
|
||||||
|
+ *n = NetXConnectPasstModel
|
||||||
|
+ return nil
|
||||||
|
case noneNetModelStr:
|
||||||
|
*n = NetXConnectNoneModel
|
||||||
|
return nil
|
||||||
|
@@ -254,6 +264,8 @@ func getLinkForEndpoint(endpoint Endpoint, netHandle *netlink.Handle) (netlink.L
|
||||||
|
link = &netlink.IPVlan{}
|
||||||
|
case *TuntapEndpoint:
|
||||||
|
link = &netlink.Tuntap{}
|
||||||
|
+ case *PasstEndpoint:
|
||||||
|
+ return nil, nil
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("Unexpected endpointType %s", ep.Type())
|
||||||
|
}
|
||||||
|
@@ -302,6 +314,11 @@ func xConnectVMNetwork(ctx context.Context, endpoint Endpoint, h Hypervisor) err
|
||||||
|
span, ctx := networkTrace(ctx, "xConnectVMNetwork", endpoint)
|
||||||
|
defer closeSpan(span, err)
|
||||||
|
|
||||||
|
+ if endpoint.Type() == PasstEndpointType {
|
||||||
|
+ networkLogger().Info("VM network via passt user-mode networking")
|
||||||
|
+ return nil
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
netPair := endpoint.NetworkPair()
|
||||||
|
|
||||||
|
queues := 0
|
||||||
|
@@ -347,6 +364,7 @@ func xDisconnectVMNetwork(ctx context.Context, endpoint Endpoint) error {
|
||||||
|
err = untapNetworkPair(ctx, endpoint)
|
||||||
|
case NetXConnectTCFilterModel:
|
||||||
|
err = removeTCFiltering(ctx, endpoint)
|
||||||
|
+ case NetXConnectPasstModel:
|
||||||
|
default:
|
||||||
|
err = fmt.Errorf("Invalid internetworking model")
|
||||||
|
}
|
||||||
|
@@ -1095,6 +1113,12 @@ func createEndpoint(netInfo NetworkInfo, idx int, model NetInterworkingModel, li
|
||||||
|
// an appropriate EndPoint based on interface type
|
||||||
|
// This should be a switch
|
||||||
|
|
||||||
|
+ if model == NetXConnectPasstModel {
|
||||||
|
+ networkLogger().Info("creating passt endpoint")
|
||||||
|
+ endpoint, err := createPasstNetworkEndpoint(idx)
|
||||||
|
+ return endpoint, err
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
// Check if interface is a physical interface. Do not create
|
||||||
|
// tap interface/bridge if it is.
|
||||||
|
isPhysical, err := isPhysicalIface(netInfo.Iface.Name)
|
||||||
|
diff --git a/src/runtime/virtcontainers/passt_endpoint.go b/src/runtime/virtcontainers/passt_endpoint.go
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000..7f40135a
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/src/runtime/virtcontainers/passt_endpoint.go
|
||||||
|
@@ -0,0 +1,156 @@
|
||||||
|
+// SPDX-License-Identifier: Apache-2.0
|
||||||
|
+//
|
||||||
|
+// passt_endpoint.go - passt endpoint for Kata Containers: start and stop passt
|
||||||
|
+//
|
||||||
|
+// Copyright (c) 2021-2022 Red Hat GmbH
|
||||||
|
+// Author: Stefano Brivio <sbrivio@redhat.com>
|
||||||
|
+
|
||||||
|
+package virtcontainers
|
||||||
|
+
|
||||||
|
+import (
|
||||||
|
+ "context"
|
||||||
|
+ "fmt"
|
||||||
|
+ "os"
|
||||||
|
+ "os/exec"
|
||||||
|
+ "syscall"
|
||||||
|
+
|
||||||
|
+ persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api"
|
||||||
|
+ vcTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
|
||||||
|
+)
|
||||||
|
+
|
||||||
|
+type PasstEndpoint struct {
|
||||||
|
+ EndpointType EndpointType
|
||||||
|
+ EndpointProperties NetworkInfo
|
||||||
|
+ PCIPath vcTypes.PciPath
|
||||||
|
+ PasstPID int
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func createPasstNetworkEndpoint(idx int) (*PasstEndpoint, error) {
|
||||||
|
+ if idx < 0 {
|
||||||
|
+ return &PasstEndpoint{}, fmt.Errorf("invalid network endpoint index: %d", idx)
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ cmd := exec.Command("passt",
|
||||||
|
+ "-P", fmt.Sprintf("/tmp/kata-passt-%d.pid", idx),
|
||||||
|
+ "-s", fmt.Sprintf("/tmp/kata-passt-%d.socket", idx))
|
||||||
|
+ err := cmd.Run()
|
||||||
|
+ if err != nil {
|
||||||
|
+ return &PasstEndpoint{}, fmt.Errorf("passt failed to start: %v", err)
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ in, err := os.Open(fmt.Sprintf("/tmp/kata-passt-%d.pid", idx))
|
||||||
|
+ if err != nil {
|
||||||
|
+ return &PasstEndpoint{}, fmt.Errorf("Failed to read passt PID: %v", err)
|
||||||
|
+ }
|
||||||
|
+ defer in.Close()
|
||||||
|
+
|
||||||
|
+ var pid int
|
||||||
|
+ _, err = fmt.Fscanf(in, "%d", &pid)
|
||||||
|
+ if err != nil {
|
||||||
|
+ return &PasstEndpoint{}, fmt.Errorf("Failed to read passt pid: %v", err)
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ endpoint := &PasstEndpoint{
|
||||||
|
+ EndpointType: PasstEndpointType,
|
||||||
|
+ PasstPID: pid,
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return endpoint, nil
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) Properties() NetworkInfo {
|
||||||
|
+ return endpoint.EndpointProperties
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) Type() EndpointType {
|
||||||
|
+ return endpoint.EndpointType
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// unsupported
|
||||||
|
+func (endpoint *PasstEndpoint) HardwareAddr() string {
|
||||||
|
+ return "00:11:22:33:44:55"
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// unsupported
|
||||||
|
+func (endpoint *PasstEndpoint) Name() string {
|
||||||
|
+ return ""
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// unsupported
|
||||||
|
+func (endpoint *PasstEndpoint) NetworkPair() *NetworkInterfacePair {
|
||||||
|
+ return nil
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// PciPath returns the PCI path of the endpoint.
|
||||||
|
+func (endpoint *PasstEndpoint) PciPath() vcTypes.PciPath {
|
||||||
|
+ return endpoint.PCIPath
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// useless
|
||||||
|
+func (endpoint *PasstEndpoint) SetPciPath(pciPath vcTypes.PciPath) {
|
||||||
|
+ endpoint.PCIPath = pciPath
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) SetProperties(properties NetworkInfo) {
|
||||||
|
+ endpoint.EndpointProperties = properties
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) Attach(ctx context.Context, s *Sandbox) error {
|
||||||
|
+ h := s.hypervisor
|
||||||
|
+ if err := xConnectVMNetwork(ctx, endpoint, h); err != nil {
|
||||||
|
+ networkLogger().WithError(err).Error("Error attaching passt endpoint")
|
||||||
|
+ return err
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return h.AddDevice(ctx, endpoint, NetDev)
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) Detach(ctx context.Context, netNsCreated bool, netNsPath string) error {
|
||||||
|
+ syscall.Kill(endpoint.PasstPID, syscall.SIGQUIT)
|
||||||
|
+
|
||||||
|
+ return nil
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) HotAttach(ctx context.Context, h Hypervisor) error {
|
||||||
|
+ return fmt.Errorf("HotAttach not supported by PasstEndpoint")
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) HotDetach(ctx context.Context, h Hypervisor, netNsCreated bool, netNsPath string) error {
|
||||||
|
+ return fmt.Errorf("HotDetatch not supported by PasstEndpoint")
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) save() persistapi.NetworkEndpoint {
|
||||||
|
+ return persistapi.NetworkEndpoint{
|
||||||
|
+ Type: string(endpoint.Type()),
|
||||||
|
+
|
||||||
|
+ Passt: &persistapi.PasstEndpoint{
|
||||||
|
+ PasstPID: endpoint.PasstPID,
|
||||||
|
+ },
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) load(s persistapi.NetworkEndpoint) {
|
||||||
|
+ endpoint.EndpointType = PasstEndpointType
|
||||||
|
+
|
||||||
|
+ if s.Passt != nil {
|
||||||
|
+ endpoint.PasstPID = s.Passt.PasstPID
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// unsupported
|
||||||
|
+func (endpoint *PasstEndpoint) GetRxRateLimiter() bool {
|
||||||
|
+ return false
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) SetRxRateLimiter() error {
|
||||||
|
+ return fmt.Errorf("rx rate limiter is unsupported for physical endpoint")
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// unsupported
|
||||||
|
+func (endpoint *PasstEndpoint) GetTxRateLimiter() bool {
|
||||||
|
+ return false
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+func (endpoint *PasstEndpoint) SetTxRateLimiter() error {
|
||||||
|
+ return fmt.Errorf("tx rate limiter is unsupported for physical endpoint")
|
||||||
|
+}
|
||||||
|
diff --git a/src/runtime/virtcontainers/persist/api/network.go b/src/runtime/virtcontainers/persist/api/network.go
|
||||||
|
index 51c3aac6..79d77cd9 100644
|
||||||
|
--- a/src/runtime/virtcontainers/persist/api/network.go
|
||||||
|
+++ b/src/runtime/virtcontainers/persist/api/network.go
|
||||||
|
@@ -79,6 +79,10 @@ type VhostUserEndpoint struct {
|
||||||
|
PCIPath vcTypes.PciPath
|
||||||
|
}
|
||||||
|
|
||||||
|
+type PasstEndpoint struct {
|
||||||
|
+ PasstPID int
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
// NetworkEndpoint contains network interface information
|
||||||
|
type NetworkEndpoint struct {
|
||||||
|
// One and only one of these below are not nil according to Type.
|
||||||
|
@@ -90,6 +94,7 @@ type NetworkEndpoint struct {
|
||||||
|
Tap *TapEndpoint `json:",omitempty"`
|
||||||
|
IPVlan *IPVlanEndpoint `json:",omitempty"`
|
||||||
|
Tuntap *TuntapEndpoint `json:",omitempty"`
|
||||||
|
+ Passt *PasstEndpoint `json:",omitempty"`
|
||||||
|
|
||||||
|
Type string
|
||||||
|
}
|
||||||
|
diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go
|
||||||
|
index 97cd6eb8..9ace0ace 100644
|
||||||
|
--- a/src/runtime/virtcontainers/qemu_arch_base.go
|
||||||
|
+++ b/src/runtime/virtcontainers/qemu_arch_base.go
|
||||||
|
@@ -615,6 +615,17 @@ func genericNetwork(endpoint Endpoint, vhost, nestedRun bool, index int) (govmmQ
|
||||||
|
FDs: netPair.VMFds,
|
||||||
|
VhostFDs: netPair.VhostFds,
|
||||||
|
}
|
||||||
|
+ case *PasstEndpoint:
|
||||||
|
+ d = govmmQemu.NetDevice{
|
||||||
|
+ Type: govmmQemu.PASST,
|
||||||
|
+ Driver: govmmQemu.VirtioNet,
|
||||||
|
+ ID: fmt.Sprintf("network-%d", index),
|
||||||
|
+ // TODO: Drop hardcoded MAC address, passt endpoint
|
||||||
|
+ // doesn't need to know it
|
||||||
|
+ MACAddress: "00:11:22:33:44:55",
|
||||||
|
+ DisableModern: nestedRun,
|
||||||
|
+ SocketPath: fmt.Sprintf("/tmp/kata-passt-%d.socket", index),
|
||||||
|
+ }
|
||||||
|
default:
|
||||||
|
return govmmQemu.NetDevice{}, fmt.Errorf("Unknown type for endpoint")
|
||||||
|
}
|
||||||
|
--
|
||||||
|
2.28.0
|
||||||
|
|
302
contrib/kata-containers/README.md
Normal file
302
contrib/kata-containers/README.md
Normal file
|
@ -0,0 +1,302 @@
|
||||||
|
This document shows how to set up a Kata Containers environment using passt to
|
||||||
|
implement user-mode networking: contrary to other networking models currently
|
||||||
|
implemented, this kind of setup requires no elevated privileges or capabilities
|
||||||
|
as far as networking is concerned.
|
||||||
|
|
||||||
|
This proof-of-concept uses CRI-O as the container runtime implementation, which is
|
||||||
|
controlled directly without resorting to a full Kubernetes environment.
|
||||||
|
|
||||||
|
# Pre-requisites
|
||||||
|
|
||||||
|
* Go and Rust toolchains, typically provided by distribution packages
|
||||||
|
* the usual tools, such as git, make, etc.
|
||||||
|
* a 4.x qemu version, or more recent, with a working virtiofsd executable
|
||||||
|
(provided at least by Debian, Ubuntu, Fedora packages)
|
||||||
|
|
||||||
|
# Fetch and prepare components
|
||||||
|
|
||||||
|
## CRI-O
|
||||||
|
|
||||||
|
CRI-O is the container runtime. It implements the Kubernetes CRI (Container
|
||||||
|
Runtime Interface) on one side -- and we'll handle that part manually with
|
||||||
|
`crictl` here, and on the other side it supports OCI (Open Container Initiative)
|
||||||
|
runtimes -- Kata Containers is one of them.
|
||||||
|
|
||||||
|
### Fetch
|
||||||
|
|
||||||
|
git clone https://github.com/cri-o/cri-o.git
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
cd cri-o
|
||||||
|
make
|
||||||
|
|
||||||
|
### Install
|
||||||
|
|
||||||
|
As root:
|
||||||
|
|
||||||
|
make install
|
||||||
|
|
||||||
|
### Configure
|
||||||
|
|
||||||
|
Configuration is now at `/etc/crio/crio.conf`. This would also be the case for
|
||||||
|
distribution packages. Some specific configuration items for Kata Containers
|
||||||
|
are:
|
||||||
|
|
||||||
|
# Cgroup management implementation used for the runtime.
|
||||||
|
cgroup_manager = "cgroupfs"
|
||||||
|
|
||||||
|
# manage_ns_lifecycle determines whether we pin and remove namespaces
|
||||||
|
# and manage their lifecycle
|
||||||
|
manage_ns_lifecycle = true
|
||||||
|
|
||||||
|
and the following section, that can be added at the end, defines a special type
|
||||||
|
of runtime, the `vm` type. This is needed to run the Kata Containers runtime
|
||||||
|
instead of the default `crun` choice:
|
||||||
|
|
||||||
|
[crio.runtime.runtimes.kata]
|
||||||
|
runtime_path = "/usr/local/bin/containerd-shim-kata-v2"
|
||||||
|
runtime_type = "vm"
|
||||||
|
runtime_root = "/run/vc"
|
||||||
|
|
||||||
|
Note that we don't have a containerd-shim-kata-v2 binary yet, we'll deal with
|
||||||
|
that in the next steps.
|
||||||
|
|
||||||
|
## CNI plugins
|
||||||
|
|
||||||
|
CNI plugins are actually binaries, run by CRI-O, used to configure networking on
|
||||||
|
the host as well as on the pod side. A few network topologies are offered, with
|
||||||
|
very limited capabilities.
|
||||||
|
|
||||||
|
### Fetch
|
||||||
|
|
||||||
|
git clone https://github.com/containernetworking/plugins
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
cd plugins
|
||||||
|
./build_linux.sh
|
||||||
|
|
||||||
|
### Install
|
||||||
|
|
||||||
|
As root:
|
||||||
|
|
||||||
|
mkdir -p /opt/cni/bin
|
||||||
|
cp bin/* /opt/cni/bin/
|
||||||
|
|
||||||
|
|
||||||
|
### Configure
|
||||||
|
|
||||||
|
The path where CNI configurations are located is configurable in
|
||||||
|
`/etc/crio/crio.conf`, see the `network_dir` parameter there. Assuming the
|
||||||
|
default value, we need to provide at least one configuration under
|
||||||
|
`/etc/cni/net.d/`. For example:
|
||||||
|
|
||||||
|
# cat /etc/cni/net.d/50-kata-sandbox.conf
|
||||||
|
{
|
||||||
|
"cniVersion": "0.3.0",
|
||||||
|
"name": "crio-bridge",
|
||||||
|
"type": "bridge",
|
||||||
|
"bridge": "cni0",
|
||||||
|
"isGateway": true,
|
||||||
|
"ipMasq": true,
|
||||||
|
"ipam": {
|
||||||
|
"type": "host-local",
|
||||||
|
"subnet": "10.88.0.0/16",
|
||||||
|
"routes": [
|
||||||
|
{ "dst": "0.0.0.0/0" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
## crictl
|
||||||
|
|
||||||
|
`crictl` is needed to control CRI-O in lieu of Kubernetes.
|
||||||
|
|
||||||
|
### Fetch
|
||||||
|
|
||||||
|
git clone https://github.com/kubernetes-sigs/cri-tools.git
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
cd cri-tools
|
||||||
|
make
|
||||||
|
|
||||||
|
### Install
|
||||||
|
|
||||||
|
As root:
|
||||||
|
|
||||||
|
make install
|
||||||
|
|
||||||
|
## mbuto
|
||||||
|
|
||||||
|
We'll use `mbuto` to build a minimal virtual machine image for usage with the
|
||||||
|
Kata Containers runtime.
|
||||||
|
|
||||||
|
### Fetch
|
||||||
|
|
||||||
|
git clone https://mbuto.lameexcu.se/mbuto
|
||||||
|
|
||||||
|
## Kata Containers
|
||||||
|
|
||||||
|
### Fetch
|
||||||
|
|
||||||
|
git clone https://github.com/kata-containers/kata-containers
|
||||||
|
|
||||||
|
### Patch
|
||||||
|
|
||||||
|
The current upstream version doesn't support the _passt_ networking model yet,
|
||||||
|
use the patch from this directory to add it:
|
||||||
|
|
||||||
|
patch -p1 < 0001-virtcontainers-agent-Add-passt-networking-model-and-.patch
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
make -C src/runtime
|
||||||
|
make -C src/agent LIBC=gnu
|
||||||
|
|
||||||
|
### Install
|
||||||
|
|
||||||
|
As root:
|
||||||
|
|
||||||
|
make -C src/runtime install
|
||||||
|
cp src/agent/target/x86_64-unknown-linux-gnu/release/kata-agent /usr/libexec/
|
||||||
|
chmod 755 /usr/libexec/kata-agent
|
||||||
|
|
||||||
|
### Build the Virtual Machine image
|
||||||
|
|
||||||
|
cd mbuto
|
||||||
|
./mbuto -f /tmp/kata.img
|
||||||
|
|
||||||
|
See `mbuto -h` for additional parameters, such as choice of kernel version,
|
||||||
|
kernel modules, program add-ons, etc. `mbuto` will print some configuration
|
||||||
|
parameters to be used in the configuration of the Kata Containers runtime below.
|
||||||
|
For example:
|
||||||
|
|
||||||
|
$ ./mbuto -c lz4 -f /tmp/kata.img
|
||||||
|
Not running as root, won't keep cpio mounted
|
||||||
|
Size: bin 12M lib 59M kmod 1.4M total 70M compressed 33M
|
||||||
|
Kata Containers [hypervisor.qemu] configuration:
|
||||||
|
|
||||||
|
kernel = "/boot/vmlinuz-5.10.0-6-amd64"
|
||||||
|
initrd = "/tmp/kata.img"
|
||||||
|
|
||||||
|
### Configure
|
||||||
|
|
||||||
|
The configuration file at this point is located at
|
||||||
|
`/usr/share/defaults/kata-containers/configuration-qemu.toml`. Some parameters of general interest are:
|
||||||
|
|
||||||
|
[hypervisor.qemu]
|
||||||
|
kernel = "/boot/vmlinuz-5.10.0-6-amd64"
|
||||||
|
initrd = "/tmp/kata.img"
|
||||||
|
|
||||||
|
where we can use the values indicated earlier by `mbuto`. Currently, the default
|
||||||
|
path for the `virtiofsd` daemon doesn't work for all distributions, ensure that
|
||||||
|
it matches. For example, on Debian:
|
||||||
|
|
||||||
|
virtio_fs_daemon = "/usr/lib/qemu/virtiofsd"
|
||||||
|
|
||||||
|
we'll then need to enable the `passt` networking model for the runtime. In the
|
||||||
|
`[runtime]` section:
|
||||||
|
|
||||||
|
internetworking_model="passt"
|
||||||
|
|
||||||
|
# Run an example container
|
||||||
|
|
||||||
|
## Fetch
|
||||||
|
|
||||||
|
We'll now need an image of a container to run as an example. With `podman`
|
||||||
|
installed via distribution package, we can import one:
|
||||||
|
|
||||||
|
podman pull docker.io/i386/busybox
|
||||||
|
|
||||||
|
## Configure
|
||||||
|
|
||||||
|
Now we can define configuration files for pod and container we want to create
|
||||||
|
and start:
|
||||||
|
|
||||||
|
$ cat pod-config.json
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"name": "kata-sandbox",
|
||||||
|
"namespace": "default",
|
||||||
|
"attempt": 1,
|
||||||
|
"uid": "hdishd83djaidwnduwk28bcsb"
|
||||||
|
},
|
||||||
|
"logDirectory": "/tmp",
|
||||||
|
"linux": {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$ cat container-busybox.json
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"name": "kata-busybox"
|
||||||
|
},
|
||||||
|
"image": {
|
||||||
|
"image": "docker.io/i386/busybox"
|
||||||
|
},
|
||||||
|
"command": [
|
||||||
|
"sleep", "6000"
|
||||||
|
],
|
||||||
|
"log_path":"kata-busybox.log",
|
||||||
|
"linux": {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
## Run the container workload
|
||||||
|
|
||||||
|
Assuming we have `pod-config.json` and `container-busybox.json` defined above,
|
||||||
|
we can now:
|
||||||
|
|
||||||
|
### start CRI-O
|
||||||
|
|
||||||
|
crio -l debug
|
||||||
|
|
||||||
|
### create the pod and run a container inside it
|
||||||
|
|
||||||
|
c=$(crictl start $(crictl create $(crictl runp --runtime=kata pod-config.json) container-busybox.json pod-config.json))
|
||||||
|
|
||||||
|
### verify that addresses are properly configured
|
||||||
|
|
||||||
|
crictl exec $c ip ad sh
|
||||||
|
|
||||||
|
## Enable support for ICMP/ICMPv6 Echo Request
|
||||||
|
|
||||||
|
_passt_ can replicate ICMP Echo Requests sent by the workload, and propagate the
|
||||||
|
replies back. However, as it's not running as root, we need to enable so-called
|
||||||
|
_ping_ sockets for unprivileged users. From the namespace created by CRI-O for
|
||||||
|
this container:
|
||||||
|
|
||||||
|
sysctl -w net.ipv4.ping_group_range="0 2147483647"
|
||||||
|
|
||||||
|
# Troubleshooting
|
||||||
|
|
||||||
|
## Redirect qemu's console output to file
|
||||||
|
|
||||||
|
Agent errors and kernel messages should be accessible via named UNIX domain
|
||||||
|
socket at `/run/vc/vm/*/console.sock`, provided `agent.debug_console` is enabled
|
||||||
|
in `kernel_params` of `configuration.toml`, but this won't work if the agent
|
||||||
|
doesn't start. In order to get those, we can wrap `qemu` and get, additionally,
|
||||||
|
all the output piped to a file:
|
||||||
|
|
||||||
|
$ cat /usr/local/bin/qemu.sh
|
||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
/usr/bin/qemu-system-x86_64 "$@" -serial file:/tmp/qemu.log 2>/tmp/qemu_err.log
|
||||||
|
|
||||||
|
now, use this as path for `qemu` in `configuration.toml`:
|
||||||
|
|
||||||
|
[hypervisor.qemu]
|
||||||
|
path = "/usr/local/bin/qemu.sh"
|
||||||
|
|
||||||
|
and don't forget to add `console=ttyS0` to the kernel parameters, so that kernel
|
||||||
|
messages will also be included:
|
||||||
|
|
||||||
|
kernel_params = "... console=ttyS0"
|
||||||
|
|
||||||
|
## Debug console
|
||||||
|
|
||||||
|
See the `kata-console` script in the
|
||||||
|
[kata-vfio-tools repository](https://github.com/dgibson/kata-vfio-tools) for a
|
||||||
|
convenient helper to access the debug console provided by the agent.
|
Loading…
Reference in a new issue