diff --git a/CODEOWNERS b/CODEOWNERS index c47456c..1f5f84c 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @crosbymichael @cyphar @dqminh @giuseppe @hqhq @mrunalp @tianon @vbatts +* @AkihiroSuda @crosbymichael @cyphar @dqminh @giuseppe @hqhq @kolyshkin @mrunalp @thaJeztah @tianon @vbatts diff --git a/GOVERNANCE.md b/GOVERNANCE.md index 92c8609..c780b12 100644 --- a/GOVERNANCE.md +++ b/GOVERNANCE.md @@ -67,4 +67,4 @@ > [runtime-spec adopted]: Tag 0647920 as 1.0.0-rc (+6 -0 #3) -[charter]: https://www.opencontainers.org/about/governance +[charter]: https://github.com/opencontainers/tob/blob/main/CHARTER.md diff --git a/MAINTAINERS b/MAINTAINERS index c4bb67d..dc3b7e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6,3 +6,6 @@ Qiang Huang (@hqhq) Aleksa Sarai (@cyphar) Giuseppe Scrivano (@giuseppe) +Akihiro Suda (@AkihiroSuda) +Kir Kolyshkin (@kolyshkin) +Sebastiaan van Stijn (@thaJeztah) diff --git a/RELEASES.md b/RELEASES.md index cd48def..96a640b 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -48,7 +48,7 @@ For example if a breaking change is introduced in v1.0.0-rc2 then the series would end with v1.0.0-rc4 and v1.0.0. * Minor and patch releases SHOULD be made on an as-needed basis. -[charter]: https://www.opencontainers.org/about/governance +[charter]: https://github.com/opencontainers/tob/blob/main/CHARTER.md ## Checklist diff --git a/config-linux.md b/config-linux.md index 178361f..08b4653 100644 --- a/config-linux.md +++ b/config-linux.md @@ -336,6 +336,11 @@ To disable it, specify a value of `true`. * **`useHierarchy`** *(bool, OPTIONAL)* - enables or disables hierarchical memory accounting. If enabled (`true`), child cgroups will share the memory limits of this cgroup. +* **`checkBeforeUpdate`** *(bool, OPTIONAL)* - enables container memory usage check before setting a new limit. + If enabled (`true`), the runtime MAY check whether a new memory limit is lower than the current usage, and if it is, MUST + reject the new limit. 
Practically, when cgroup v1 is used, the kernel rejects a limit lower than the + current usage, and when cgroup v2 is used, the OOM killer is invoked. This setting can be used on + cgroup v2 to mimic the cgroup v1 behavior. #### Example @@ -360,6 +365,9 @@ * **`shares`** *(uint64, OPTIONAL)* - specifies a relative share of CPU time available to the tasks in a cgroup * **`quota`** *(int64, OPTIONAL)* - specifies the total amount of time in microseconds for which all tasks in a cgroup can run during one period (as defined by **`period`** below) + If specified with any (valid) positive value, it MUST be no smaller than `burst` (runtimes MAY generate an error). +* **`burst`** *(uint64, OPTIONAL)* - specifies the maximum amount of accumulated time in microseconds for which all tasks in a cgroup can run additionally for burst during one period (as defined by **`period`** below) + If specified, this value MUST be no larger than any positive `quota` (runtimes MAY generate an error). * **`period`** *(uint64, OPTIONAL)* - specifies a period of time in microseconds for how regularly a cgroup's access to CPU resources should be reallocated (CFS scheduler only) * **`realtimeRuntime`** *(int64, OPTIONAL)* - specifies a period of time in microseconds for the longest continuous period in which the tasks in a cgroup have access to CPU resources * **`realtimePeriod`** *(uint64, OPTIONAL)* - same as **`period`** but applies to realtime scheduler only @@ -373,6 +381,7 @@ "cpu": { "shares": 1024, "quota": 1000000, + "burst": 1000000, "period": 500000, "realtimeRuntime": 950000, "realtimePeriod": 1000000, @@ -701,6 +710,7 @@ * `SECCOMP_FILTER_FLAG_TSYNC` * `SECCOMP_FILTER_FLAG_LOG` * `SECCOMP_FILTER_FLAG_SPEC_ALLOW` + * `SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV` * **`listenerPath`** *(string, OPTIONAL)* - specifies the path of UNIX domain socket over which the runtime will send the [container process state](#containerprocessstate) data structure when the `SCMP_ACT_NOTIFY` action is used. 
This socket MUST use `AF_UNIX` domain and `SOCK_STREAM` type. diff --git a/config.md b/config.md index fe28de6..b4aef6a 100644 --- a/config.md +++ b/config.md @@ -353,6 +353,18 @@ ```json "hostname": "mrsdalloway" +``` + +## Domainname + +* **`domainname`** (string, OPTIONAL) specifies the container's domainname as seen by processes running inside the container. + On Linux, for example, this will change the domainname in the [container](glossary.md#container-namespace) [UTS namespace][uts-namespace.7]. + Depending on your [namespace configuration](config-linux.md#namespaces), the container UTS namespace may be the [runtime](glossary.md#runtime-namespace) [UTS namespace][uts-namespace.7]. + +### Example + +```json +"domainname": "foobarbaz.test" ``` ## Platform-specific configuration @@ -430,8 +442,9 @@ ### Prestart -The `prestart` hooks MUST be called after the [`start`](runtime.md#start) operation is called but [before the user-specified program command is executed](runtime.md#lifecycle). +The `prestart` hooks MUST be called as part of the [`create`](runtime.md#create) operation after the runtime environment has been created (according to the configuration in config.json) but before the `pivot_root` or any equivalent operation has been executed. On Linux, for example, they are called after the container namespaces are created, so they provide an opportunity to customize the container (e.g. the network namespace could be specified in this hook). +The `prestart` hooks MUST be called before the `createRuntime` hooks. Note: `prestart` hooks were deprecated in favor of `createRuntime`, `createContainer` and `startContainer` hooks, which allow more granular hook control during the create and start phase. @@ -448,8 +461,6 @@ On Linux, for example, they are called after the container namespaces are created, so they provide an opportunity to customize the container (e.g. the network namespace could be specified in this hook). 
The definition of `createRuntime` hooks is currently underspecified and hooks authors, should only expect from the runtime that the mount namespace have been created and the mount operations performed. Other operations such as cgroups and SELinux/AppArmor labels might not have been performed by the runtime. - -Note: `runc` originally implemented `prestart` hooks contrary to the spec, namely as part of the `create` operation (instead of during the `start` operation). This incorrect implementation actually corresponds to `createRuntime` hooks. For runtimes that implement the deprecated `prestart` hooks as `createRuntime` hooks, `createRuntime` hooks MUST be called after the `prestart` hooks. ### CreateContainer Hooks diff --git a/schema/config-linux.json b/schema/config-linux.json index d551afb..eb22872 100644 --- a/schema/config-linux.json +++ b/schema/config-linux.json @@ -109,6 +109,9 @@ }, "quota": { "$ref": "defs.json#/definitions/int64" + }, + "burst": { + "$ref": "defs.json#/definitions/uint64" }, "realtimePeriod": { "$ref": "defs.json#/definitions/uint64" @@ -169,6 +172,9 @@ }, "useHierarchy": { "type": "boolean" + }, + "checkBeforeUpdate": { + "type": "boolean" } } }, diff --git a/schema/config-schema.json b/schema/config-schema.json index a4d1274..cf66c65 100644 --- a/schema/config-schema.json +++ b/schema/config-schema.json @@ -33,6 +33,9 @@ "$ref": "defs.json#/definitions/annotations" }, "hostname": { + "type": "string" + }, + "domainname": { "type": "string" }, "mounts": { diff --git a/schema/defs-linux.json b/schema/defs-linux.json index 5727802..ff36288 100644 --- a/schema/defs-linux.json +++ b/schema/defs-linux.json @@ -70,7 +70,8 @@ "enum": [ "SECCOMP_FILTER_FLAG_TSYNC", "SECCOMP_FILTER_FLAG_LOG", - "SECCOMP_FILTER_FLAG_SPEC_ALLOW" + "SECCOMP_FILTER_FLAG_SPEC_ALLOW", + "SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV" ] }, "SeccompOperators": { diff --git a/schema/test/config/good/spec-example.json b/schema/test/config/good/spec-example.json index 
a784d1d..2bf285b 100644 --- a/schema/test/config/good/spec-example.json +++ b/schema/test/config/good/spec-example.json @@ -63,6 +63,7 @@ "readonly": true }, "hostname": "slartibartfast", + "domainname": "foobarbaz.test", "mounts": [ { "destination": "/proc", @@ -269,11 +270,13 @@ "kernelTCP": -1, "swappiness": 0, "disableOOMKiller": false, - "useHierarchy": false + "useHierarchy": false, + "checkBeforeUpdate": false }, "cpu": { "shares": 1024, "quota": 1000000, + "burst": 1000000, "period": 500000, "realtimeRuntime": 950000, "realtimePeriod": 1000000, diff --git a/specs-go/config.go b/specs-go/config.go index cf1b338..5b4f691 100644 --- a/specs-go/config.go +++ b/specs-go/config.go @@ -12,6 +12,8 @@ Root *Root `json:"root,omitempty"` // Hostname configures the container's hostname. Hostname string `json:"hostname,omitempty"` + // Domainname configures the container's domainname. + Domainname string `json:"domainname,omitempty"` // Mounts configures additional mounts (on top of Root). Mounts []Mount `json:"mounts,omitempty"` // Hooks configures callbacks for container lifecycle events. @@ -317,6 +319,10 @@ DisableOOMKiller *bool `json:"disableOOMKiller,omitempty"` // Enables hierarchical memory accounting UseHierarchy *bool `json:"useHierarchy,omitempty"` + // CheckBeforeUpdate enables checking if a new memory limit is lower + // than the current usage during update, and if so, rejecting the new + // limit. + CheckBeforeUpdate *bool `json:"checkBeforeUpdate,omitempty"` } // LinuxCPU for Linux cgroup 'cpu' resource management @@ -325,6 +331,9 @@ Shares *uint64 `json:"shares,omitempty"` // CPU hardcap limit (in usecs). Allowed cpu time in a given period. Quota *int64 `json:"quota,omitempty"` + // CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a + // given period. + Burst *uint64 `json:"burst,omitempty"` // CPU period to be used for hardcapping (in usecs). 
Period *uint64 `json:"period,omitempty"` // How much time realtime scheduling may use (in usecs). @@ -643,6 +652,10 @@ // LinuxSeccompFlagSpecAllow can be used to disable Speculative Store // Bypass mitigation. (since Linux 4.17) LinuxSeccompFlagSpecAllow LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_SPEC_ALLOW" + + // LinuxSeccompFlagWaitKillableRecv can be used to switch to the wait + // killable semantics. (since Linux 5.19) + LinuxSeccompFlagWaitKillableRecv LinuxSeccompFlag = "SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV" ) // Additional architectures permitted to be used for system calls