Compare commits

...
Sign in to create a new pull request.

2 commits

Author SHA1 Message Date
3e2c481034 C1: drop CC: prefixes from mutelist entries; remove CC tooling
Strips the "CC: <id>." prefix from every Description field in the
Prowler mutelist YAML files (and the statement field in trivyignore).
Each entry's free-form description now stands on its own.

Deletes compensating-controls.yaml (the CC registry) and the
review-compensating-controls mise task. Updates
review-compliance-reports to drop CC references from docstrings,
panel text, and table titles. Node verification logic is unchanged.
2026-05-22 20:09:39 -07:00
69737dc915 C1: docs-first removal of compensating-controls framework
Deletes the CC how-to and explanation docs, and the orphan changelog
fragments describing CC reviews. Updates security.md and
read-compliance-reports.md to describe muting in terms of the mutelist
files only. Adds the branch changelog fragment.

Mutelist YAML files, the Prowler CronJobs, and the
review-compliance-reports task all stay — they're updated in the next
commit.
2026-05-22 20:09:28 -07:00
21 changed files with 72 additions and 758 deletions

View file

@ -6,48 +6,48 @@ Mutelist:
"apiserver_always_pull_images_plugin":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: single-user-cluster, local-registry. Only the operator has cluster access; all images pulled from private zot registry."
Description: "Only the operator has cluster access; all images pulled from private zot registry."
"apiserver_audit_log_maxage_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail."
Description: "Alloy/Loki provides pod-level audit trail."
"apiserver_audit_log_maxbackup_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail."
Description: "Alloy/Loki provides pod-level audit trail."
"apiserver_audit_log_maxsize_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail."
Description: "Alloy/Loki provides pod-level audit trail."
"apiserver_audit_log_path_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail."
Description: "Alloy/Loki provides pod-level audit trail."
"apiserver_deny_service_external_ips":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: tailscale-network-isolation. No external IPs routable; cluster only reachable via tailnet."
Description: "No external IPs routable; cluster only reachable via tailnet."
"apiserver_disable_profiling":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet."
Description: "Profiling endpoint unreachable from public internet."
"apiserver_encryption_provider_config_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: tailscale-network-isolation, single-user-cluster. Etcd not network-exposed; only operator has node access."
Description: "Etcd not network-exposed; only operator has node access."
"apiserver_kubelet_cert_auth":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: tailscale-network-isolation. Kubelet API not exposed outside the node; minikube auto-generates certificates."
Description: "Kubelet API not exposed outside the node; minikube auto-generates certificates."
"apiserver_request_timeout_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: tailscale-network-isolation. API server only reachable via tailnet; DoS risk limited to trusted clients."
Description: "API server only reachable via tailnet; DoS risk limited to trusted clients."
"apiserver_service_account_lookup_true":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: single-user-cluster. Only operator manages service accounts; no revoked tokens in circulation."
Description: "Only operator manages service accounts; no revoked tokens in circulation."
"apiserver_strong_ciphers_only":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "CC: tailscale-network-isolation. API server traffic encrypted by WireGuard at the network layer."
Description: "API server traffic encrypted by WireGuard at the network layer."

View file

@ -6,12 +6,12 @@ Mutelist:
"controllermanager_disable_profiling":
Regions: ["*"]
Resources: ["^kube-controller-manager-minikube$"]
Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet."
Description: "Profiling endpoint unreachable from public internet."
"scheduler_profiling":
Regions: ["*"]
Resources: ["^kube-scheduler-minikube$"]
Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet."
Description: "Profiling endpoint unreachable from public internet."
"kubelet_tls_cert_and_key":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: tailscale-network-isolation, single-user-cluster. Kubelet API not exposed outside node; minikube auto-generates certificates."
Description: "Kubelet API not exposed outside node; minikube auto-generates certificates."

View file

@ -17,9 +17,8 @@ Mutelist:
- "^kindnet-"
- "^storage-provisioner$"
Description: >-
CC: tailscale-network-isolation. Control-plane and networking
pods require hostNetwork by design. Host network itself is
only reachable via tailnet.
Control-plane and networking pods require hostNetwork by design.
Host network itself is only reachable via tailnet.
"core_minimize_privileged_containers":
Regions: ["*"]
Resources:
@ -31,7 +30,6 @@ Mutelist:
# Forgejo runner
- "^forgejo-runner-"
Description: >-
CC: single-user-cluster, operator-managed-pods, trusted-ci-only.
kube-proxy: system pod, single-user cluster. ts-*/ingress-*:
Tailscale operator-managed. forgejo-runner: DinD limited to
trusted private forge repos.
@ -49,25 +47,24 @@ Mutelist:
- "^nameserver-"
- "^ingress-"
Description: >-
CC: single-user-cluster, operator-managed-pods. System pods
managed by minikube and Tailscale operator; seccomp profiles
set by upstream. Single-user cluster limits exploit surface.
System pods managed by minikube and Tailscale operator;
seccomp profiles set by upstream. Single-user cluster limits
exploit surface.
"core_minimize_hostPID_containers":
Regions: ["*"]
Resources:
- "^prowler-"
Description: >-
CC: ephemeral-privileged-jobs. Prowler CIS scanner requires
hostPID for file permission checks. Runs as CronJob with
7-day TTL, not a persistent workload.
Prowler CIS scanner requires hostPID for file permission
checks. Runs as CronJob with 7-day TTL, not a persistent
workload.
"core_minimize_root_containers_admission":
Regions: ["*"]
Resources:
- "^grafana-"
Description: >-
CC: init-container-isolation. Root limited to init-chown-data
container; all runtime containers run as UID 472 with caps
dropped.
Root limited to init-chown-data container; all runtime
containers run as UID 472 with caps dropped.
"core_minimize_containers_added_capabilities":
Regions: ["*"]
Resources:
@ -77,10 +74,9 @@ Mutelist:
# Grafana init-chown-data
- "^grafana-"
Description: >-
CC: single-user-cluster, init-container-isolation. System
pods: capabilities required by function (minikube-managed).
Grafana: CHOWN limited to init phase; runtime containers
drop ALL.
System pods: capabilities required by function
(minikube-managed). Grafana: CHOWN limited to init phase;
runtime containers drop ALL.
"core_minimize_containers_capabilities_assigned":
Regions: ["*"]
Resources:
@ -88,5 +84,4 @@ Mutelist:
- "^kindnet-"
- "^grafana-"
Description: >-
CC: single-user-cluster, init-container-isolation. See
core_minimize_containers_added_capabilities.
See core_minimize_containers_added_capabilities.

View file

@ -1,7 +1,7 @@
# Node-level and RBAC checks that Prowler reports as MANUAL because it
# cannot evaluate them from inside a pod. Compensated by automated
# verification in `mise run review-compliance-reports`, which SSHes into
# the minikube node and checks each condition directly every week.
# cannot evaluate them from inside a pod. Verified out-of-band by the
# node-verification block in `mise run review-compliance-reports`, which
# SSHes into the minikube node and checks each condition directly.
Mutelist:
Accounts:
"*":
@ -9,51 +9,51 @@ Mutelist:
"etcd_unique_ca":
Regions: ["*"]
Resources: ["^etcd-minikube$"]
Description: "CC: node-config-automated-verification. Etcd CA fingerprint verified different from cluster CA by review-compliance-reports."
Description: "Etcd CA fingerprint verified different from cluster CA by review-compliance-reports."
"kubelet_conf_file_ownership":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification. File ownership verified root:root by review-compliance-reports."
Description: "File ownership verified root:root by review-compliance-reports."
"kubelet_conf_file_permissions":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification. File permissions verified 600 by review-compliance-reports."
Description: "File permissions verified 600 by review-compliance-reports."
"kubelet_config_yaml_ownership":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification. File ownership verified root:root by review-compliance-reports."
Description: "File ownership verified root:root by review-compliance-reports."
"kubelet_config_yaml_permissions":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification. File permissions verified 644 by review-compliance-reports."
Description: "File permissions verified 644 by review-compliance-reports."
"kubelet_service_file_ownership_root":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification. File ownership verified root:root by review-compliance-reports."
Description: "File ownership verified root:root by review-compliance-reports."
"kubelet_service_file_permissions":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification. File permissions verified 644 by review-compliance-reports."
Description: "File permissions verified 644 by review-compliance-reports."
"kubelet_disable_read_only_port":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification. readOnlyPort absence (defaults to 0) verified by review-compliance-reports."
Description: "readOnlyPort absence (defaults to 0) verified by review-compliance-reports."
"kubelet_event_record_qps":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification. eventRecordQPS absence (defaults to 5) verified by review-compliance-reports."
Description: "eventRecordQPS absence (defaults to 5) verified by review-compliance-reports."
"kubelet_manage_iptables":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification. makeIPTablesUtilChains absence (defaults to true) verified by review-compliance-reports."
Description: "makeIPTablesUtilChains absence (defaults to true) verified by review-compliance-reports."
"kubelet_strong_ciphers_only":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "CC: node-config-automated-verification, tailscale-network-isolation. Go default ciphers used; all traffic WireGuard-encrypted via tailnet."
Description: "Go default ciphers used; all traffic WireGuard-encrypted via tailnet."
"rbac_cluster_admin_usage":
Regions: ["*"]
Resources:
- "^cluster-admin$"
- "^kubeadm:cluster-admins$"
- "^minikube-rbac$"
Description: "CC: node-config-automated-verification, single-user-cluster. Only built-in/minikube cluster-admin bindings present; verified by review-compliance-reports."
Description: "Only built-in/minikube cluster-admin bindings present; verified by review-compliance-reports."

View file

@ -13,9 +13,8 @@ Mutelist:
# ArgoCD
- "^argocd-"
Description: >-
CC: single-user-cluster, sso-gated-admin-tools. Built-in
K8s roles: only operator can bind them. ArgoCD: requires
broad access but is SSO-gated via Authentik OIDC.
Built-in K8s roles: only operator can bind them. ArgoCD:
requires broad access but is SSO-gated via Authentik OIDC.
"rbac_minimize_pod_creation_access":
Regions: ["*"]
Resources:
@ -26,14 +25,12 @@ Mutelist:
# CloudNativePG operator
- "^cnpg-manager$"
Description: >-
CC: single-user-cluster. Built-in K8s roles and CNPG
operator. Only the operator can assign these roles; no
untrusted users have cluster access.
Built-in K8s roles and CNPG operator. Only the operator can
assign these roles; no untrusted users have cluster access.
"rbac_minimize_service_account_token_creation":
Regions: ["*"]
Resources:
- "^system:"
Description: >-
CC: single-user-cluster. kube-controller-manager requires
token creation for SA management. Only operator manages
service accounts.
kube-controller-manager requires token creation for SA
management. Only operator manages service accounts.

View file

@ -14,26 +14,24 @@ misconfigurations:
paths:
- "argocd/manifests/external-secrets/rbac.yaml"
statement: >-
CC: operator-purpose-bound-rbac. external-secrets-operator's entire
function is to read and synthesize Secret objects; ClusterRole over
secrets is its purpose. Both the controller and cert-controller are
external-secrets-operator's entire function is to read and
synthesize Secret objects; ClusterRole over secrets is its
purpose. Both the controller and cert-controller are
upstream-defined.
- id: KSV-0041
paths:
- "argocd/manifests/kube-state-metrics/rbac.yaml"
- "argocd/manifests/kube-state-metrics-ringtail/rbac.yaml"
statement: >-
CC: kube-state-metrics-metadata-only. KSM exposes only Secret
metadata (name, namespace, type, labels), never the data field.
list/watch on secrets is required for kube_secret_info /
kube_secret_labels metrics.
KSM exposes only Secret metadata (name, namespace, type, labels),
never the data field. list/watch on secrets is required for
kube_secret_info / kube_secret_labels metrics.
- id: KSV-0114
paths:
- "argocd/manifests/external-secrets/rbac.yaml"
statement: >-
CC: operator-purpose-bound-rbac. cert-controller manages the
external-secrets validating webhook configurations to inject its
own rotating CA bundle. RBAC is scoped to two named webhooks
(secretstore-validate, externalsecret-validate) via resourceNames;
KSV-0114 doesn't see the resourceNames restriction so reports the
full ClusterRole.
cert-controller manages the external-secrets validating webhook
configurations to inject its own rotating CA bundle. RBAC is
scoped to two named webhooks (secretstore-validate,
externalsecret-validate) via resourceNames; KSV-0114 doesn't see
the resourceNames restriction so reports the full ClusterRole.

View file

@ -1,210 +0,0 @@
# Compensating Controls
#
# Documents controls that mitigate risks from suppressed or accepted security
# findings. Referenced by security tools (Prowler mutelist, Kingfisher config,
# etc.) via "CC: <id>" in finding descriptions or suppression notes.
#
# Used by `mise run review-compensating-controls` to surface stale controls.
#
# Fields:
# id - kebab-case unique identifier, referenced from tool configs
# description - what the control actually does to mitigate risk
# created - date (YYYY-MM-DD) the control was documented
# last-reviewed - date (YYYY-MM-DD) or null
# notes - optional context
controls:
- id: single-user-cluster
description: >-
Only the cluster operator (eblume) has kubectl access. No untrusted
users can create pods, access cached images, or bind RBAC roles.
created: 2026-03-30
last-reviewed: 2026-04-01
notes: >-
Verify by checking kubeconfig distribution and Tailscale ACLs.
If additional users gain cluster access, re-evaluate all findings
muted under this control.
- id: tailscale-network-isolation
description: >-
Cluster is not internet-exposed. All access requires Tailscale
identity with ACL enforcement. Profiling endpoints, debug ports,
and control-plane APIs are unreachable from the public internet.
created: 2026-03-30
last-reviewed: 2026-04-06
notes: >-
Verify with 'tailscale serve status --json' on indri and review
Tailscale ACLs in pulumi/tailscale/. Only tag:flyio-target services
are publicly routable.
- id: local-registry
description: >-
Operator-built services use a private zot registry
(registry.ops.eblu.me) for supply-chain control. Remaining
images are pulled from public registries without stored
credentials. No shared registry secrets are cached on cluster
nodes.
created: 2026-03-30
last-reviewed: 2026-04-12
notes: >-
Verify by checking image prefixes in kustomization.yaml files.
Known external-image categories: (1) upstream apps not yet
mirrored — immich, ollama, frigate, frigate-notify, valkey;
(2) infrastructure components — tailscale operator/proxy,
external-secrets, 1password-connect, forgejo-runner, docker
DinD, nvidia-device-plugin; (3) utility base images — busybox,
alpine (grafana init containers). Track upstream versions in
service-versions.yaml. Goal is to progressively mirror these
into zot.
- id: sso-gated-admin-tools
description: >-
ArgoCD requires SSO authentication via Authentik OIDC. Wildcard
RBAC roles are mitigated by requiring authenticated identity
before any API access.
created: 2026-03-30
last-reviewed: 2026-04-14
notes: >-
Verify Authentik OIDC provider config for ArgoCD and that
anonymous access is disabled. Check ArgoCD --auth-token isn't
leaked. The workflow-bot API key account is scoped to sync/get
only.
- id: operator-managed-pods
description: >-
Tailscale operator manages proxy pod specs (ts-*, ingress-*,
operator-*, nameserver-*). Pod security settings are set by the
operator, not user manifests. Operator is tracked in
service-versions.yaml and regularly updated.
created: 2026-03-30
last-reviewed: 2026-04-21
notes: >-
Verify operator version is current via 'mise run service-review'.
Check Tailscale changelog for security fixes. If operator adds
seccomp support, remove these mutes. As of 2026-04-21: still no
default seccomp on operator-generated pods (upstream issue #7359
open). A ProxyClass + generic device plugin can downgrade proxies
from privileged to NET_ADMIN+NET_RAW and set seccompProfile —
potential future remediation to remove the seccomp mute without
waiting for upstream defaults.
- id: ephemeral-privileged-jobs
description: >-
Prowler CIS scanner runs as a CronJob with 7-day TTL
auto-deletion, not as a persistent privileged workload. hostPID
exposure is time-bounded to scan duration (~20s).
created: 2026-03-30
last-reviewed: 2026-04-29
notes: >-
Verify TTL is set in cronjob.yaml. Check that no persistent
pods run with hostPID on the scanned cluster (indri). The
alloy-tracing DaemonSet on ringtail also uses hostPID but is
out of scope — Prowler only scans indri. Tracked in Todoist:
"prowler scan against ringtail" — once that lands, the
DaemonSet's hostPID+privileged posture will surface as a CIS
finding and need its own CC or remediation.
- id: trusted-ci-only
description: >-
Forgejo runner only executes workflows from repos on the private
forge (forge.ops.eblu.me). No external or untrusted repos can
trigger privileged CI jobs.
created: 2026-03-30
last-reviewed: 2026-05-01
notes: >-
Verification: (1) Runner config (argocd/manifests/forgejo-runner/
config.yaml) connects only to https://forge.ops.eblu.me/. (2) Forge
app.ini has DISABLE_REGISTRATION=true and
ALLOW_ONLY_EXTERNAL_REGISTRATION=true (ansible/roles/forgejo/defaults/main.yml) — no untrusted users
can sign up or create repos. The runner registers at instance scope
(repo_id=0/owner_id=0 in action_runner table), but the instance itself
is closed, so no per-repo allow-list is needed. Re-evaluate if the
forge ever opens to additional users or if the runner is repointed
to an external forge.
- id: init-container-isolation
description: >-
Root privileges and added capabilities (CHOWN) are limited to
init containers that run once at pod startup. All runtime
containers run as non-root (UID 472) with all capabilities
dropped.
created: 2026-03-30
last-reviewed: 2026-05-04
notes: >-
Verify by inspecting grafana deployment.yaml securityContext
for both init and runtime containers. If fsGroup alone can
handle PVC ownership, remove init-chown-data and this control.
Retirement deferred until grafana lands on ringtail's k3s
(see [[indri-k8s-migration]]) — storage backend will change,
and removing init-chown-data right before that migration
trades a real safety net for marginal cleanup. Revisit
post-migration.
- id: node-config-automated-verification
description: >-
Prowler reports certain node-level checks as MANUAL because it runs
inside a pod and cannot evaluate kubelet file permissions, kubelet
config arguments, etcd CA separation, or cluster-admin RBAC bindings.
The review-compliance-reports script SSHes into the minikube node
weekly and programmatically verifies each condition, failing loudly
if any check deviates from expected values.
created: 2026-04-14
last-reviewed: 2026-04-14
notes: >-
Verification runs as part of 'mise run review-compliance-reports'.
If minikube node is unreachable, all checks report as FAIL. If new
MANUAL findings appear in Prowler, add corresponding verification
logic to the script and update the mutelist.
- id: operator-purpose-bound-rbac
description: >-
Operators whose entire function is to manage a sensitive resource
legitimately need RBAC over that resource. external-secrets-operator
manages Secret objects (its purpose) and the cert-controller mutates
its own ValidatingWebhookConfigurations to inject rotating CA bundles.
Risk is bounded by: (1) the operator code being upstream open-source
and reviewed; (2) RBAC scoped to specific named webhooks where
possible; (3) supply chain controls on the operator image (mirrored
to local registry, version tracked in service-versions.yaml).
created: 2026-04-27
last-reviewed: 2026-04-27
notes: >-
Verify by checking that the operators in question still match their
stated purpose (i.e. external-secrets is still the only consumer of
these ClusterRoles) and that upstream hasn't published advisories
for credential-handling bugs. Re-evaluate if a non-secrets-managing
ClusterRole appears under this control.
- id: kube-state-metrics-metadata-only
description: >-
kube-state-metrics holds list/watch on Secrets cluster-wide but only
exposes Secret object *metadata* (name, namespace, type, creation
timestamp, labels) via the kube_secret_info / kube_secret_labels
metrics. Secret data fields are never read into KSM's exposed
metrics by upstream design. Mitigation rests on KSM's metric
schema, the version pin in service-versions.yaml, and the metrics
endpoint being reachable only on the cluster network.
created: 2026-04-27
last-reviewed: 2026-04-27
notes: >-
Verify by inspecting the /metrics endpoint output for any series
that include secret data (only *_info and *_labels metrics should
reference secrets, and labels should be limited to user-applied
labels — never the data:). Re-evaluate on KSM version bumps.
- id: observability-stack-audit
description: >-
Alloy collects pod logs and ships them to Loki, providing an
audit trail for cluster activity. Compensates for missing
apiserver audit logging which neither minikube (indri) nor
k3s (ringtail) configures by default.
created: 2026-03-30
last-reviewed: 2026-05-11
notes: >-
Verify Alloy DaemonSet is running on each cluster (alloy-k8s on
minikube, alloy-ringtail on k3s) and Loki is receiving logs.
Note this is weaker than native apiserver audit logs — it
captures pod stdout/stderr, not API request-level auditing.
Consider enabling apiserver audit logging on k3s post-migration
(`--audit-log-path` / `--audit-policy-file`) — minikube made it
hard, k3s makes it straightforward.

View file

@ -1 +0,0 @@
New explanation article [[compliance-mute-categories]] documenting the gap between current `CC:`-only mute tagging and the three structurally distinct categories (compensating control, not-applicable, risk-accepted) needed for real PCI DSS / SOC2 practice. Captures the current image-scan mutelist gap (`cronjob-image-scan.yaml` doesn't pass `--mutelist-file`) and proposes an order-of-operations for wiring it up alongside the new tag conventions. Triggered by CVE-2026-31789, an OpenSSL 32-bit-only finding that surfaced the need for an NA category.

View file

@ -1 +0,0 @@
Reviewed compensating control `ephemeral-privileged-jobs`: TTL and hostPID scope verified on indri. Noted that the alloy-tracing DaemonSet on ringtail is out of scope until Prowler scans ringtail (tracked in Todoist).

View file

@ -1 +0,0 @@
Reviewed compensating control `init-container-isolation` (35 days stale). Grafana's running pod matches the manifest and the CC's claim — only `init-chown-data` runs as root with `CHOWN`; runtime containers all run as UID 472 with all caps dropped. Retirement (replacing init-chown-data with `fsGroup` alone) is plausible given the in-tree minikube-hostpath provisioner, but deferred until grafana lands on ringtail's k3s — note added to the CC.

View file

@ -1 +0,0 @@
Reviewed compensating control `trusted-ci-only`: Forgejo runner is registered only to the private forge, which has registration disabled — no untrusted users can create repos or trigger privileged CI. Tightened the notes to reflect that the closed-forge property (not a per-repo allow-list) is what actually mitigates the risk.

View file

@ -1 +1 @@
Address the 6 critical Prowler IaC findings against `argocd/manifests/`. Prowler's IaC provider hardcodes `self._mutelist = None` and delegates filtering to Trivy, but doesn't plumb `--ignorefile` through — so the documented "use Trivy filtering" path is actually broken. Added a shim around `trivy` in the Prowler image that injects `--ignorefile $TRIVY_IGNOREFILE` for `trivy fs` invocations when the env var points at a real file. The IaC cronjob now mounts `mutelist/trivyignore.yaml` (Trivy's per-path schema) and sets the env var. Two new compensating controls — `operator-purpose-bound-rbac` and `kube-state-metrics-metadata-only` — justify muting the `external-secrets` and `kube-state-metrics` Secret-access findings (KSV-0041, KSV-0114). Separately, `grafana-clusterrole` is tightened to remove `secrets` access entirely: the dashboard sidecar already only consumes ConfigMap-labeled dashboards, so its `RESOURCE` env var is now `configmap` instead of `both`.
Address the 6 critical Prowler IaC findings against `argocd/manifests/`. Prowler's IaC provider hardcodes `self._mutelist = None` and delegates filtering to Trivy, but doesn't plumb `--ignorefile` through — so the documented "use Trivy filtering" path is actually broken. Added a shim around `trivy` in the Prowler image that injects `--ignorefile $TRIVY_IGNOREFILE` for `trivy fs` invocations when the env var points at a real file. The IaC cronjob now mounts `mutelist/trivyignore.yaml` (Trivy's per-path schema) and sets the env var, muting the `external-secrets` and `kube-state-metrics` Secret-access findings (KSV-0041, KSV-0114). Separately, `grafana-clusterrole` is tightened to remove `secrets` access entirely: the dashboard sidecar already only consumes ConfigMap-labeled dashboards, so its `RESOURCE` env var is now `configmap` instead of `both`.

View file

@ -1 +0,0 @@
Reviewed compensating control `observability-stack-audit`. Updated description to cover ringtail's k3s as well as indri's minikube; both Alloy DaemonSets and Loki are healthy.

View file

@ -0,0 +1 @@
Ripped out the compensating-controls (CC) framework: deleted `compensating-controls.yaml`, the `review-compensating-controls` mise task, and the associated how-to / explanation docs. Prowler and Kingfisher continue to run weekly and produce reports; the Prowler mutelist YAML files remain in place but no longer carry `CC: <id>` prefixes — each entry just keeps a free-form `Description` of why the finding is muted. The CC review cadence proved to be more overhead than this single-operator homelab needed.

View file

@ -1,99 +0,0 @@
---
title: Compliance Mute Categories
modified: 2026-05-04
last-reviewed: 2026-05-04
tags:
- explanation
- security
- compliance
---
# Compliance Mute Categories
> **Note:** This article was drafted by AI and reviewed by Erich. I plan to rewrite all explanatory content in my own words — these serve as placeholders to establish the documentation structure.
How BlumeOps should categorize muted compliance findings, why a single "compensating control" tag is not enough, and what tooling work is needed to support multiple categories cleanly.
## Why this matters
When a compliance scanner ([[prowler]], Trivy via Prowler IaC, Kingfisher) reports a failing finding, there are three structurally different reasons we might suppress it:
1. **Compensating control (CC)** — the requirement applies and we *do not* meet it directly, but an alternative control mitigates the same risk.
2. **Not applicable (NA)** — the requirement's preconditions cannot be satisfied in our environment, so the finding is structurally inert (e.g. a 32-bit-only CVE on 64-bit-only hosts).
3. **Risk accepted (RA)** — the requirement applies, we do not meet it, no compensating control exists, and we have explicitly chosen to accept the residual risk for a bounded period.
Today every muted finding in BlumeOps uses the `CC: <id>` convention. That conflates all three categories. In a real PCI DSS or SOC2 environment, auditors treat them very differently:
- A CC requires documentation of the constraint, the alternative measure, and recurring validation that the measure still works.
- An NA requires documentation of *why* the precondition cannot be met, with periodic verification that the environmental fact still holds.
- An RA requires an explicit decision-maker, an expiry date, and a scheduled re-decision.
Mixing them under one tag means stale CCs hide stale RAs, and NAs that should be revisited when the environment changes get treated as permanent fixtures.
## Trigger case: CVE-2026-31789
The 2026-05-03 weekly compliance review surfaced [CVE-2026-31789](https://nvd.nist.gov/vuln/detail/CVE-2026-31789), an OpenSSL heap buffer overflow during X.509 certificate processing on **32-bit systems**. Prowler's image scanner flagged 216 findings across 106 BlumeOps images carrying `libssl3` / `libcrypto3` below the fixed versions.
The CVE is genuine, but its preconditions cannot be satisfied in our environment: indri is Apple Silicon (arm64), ringtail is x86_64, and we run no 32-bit containers. This is the canonical NA case — not a CC, because there is no "alternative measure mitigating the risk." The risk does not exist for us at all.
A CC like `no-32bit-runtimes` would technically work, but conflates the categories: if we ever introduce a 32-bit runtime we would have to remember that this CC was load-bearing for the mute, retire or scope it down, and reopen the muted findings. An NA tag with a short justification makes the precondition explicit and self-documents the conditions under which it must be revisited.
## Current tooling state
Three Prowler scans run weekly. Their mute paths today:
| Scan | Mute mechanism | File(s) |
|------|----------------|---------|
| K8s CIS (Sunday) | Prowler `--mutelist-file`, merged from ConfigMap | `argocd/manifests/prowler/mutelist/*.yaml` |
| IaC (Saturday) | Trivy `--ignorefile` shim (Prowler's `--mutelist-file` is a no-op for IaC) | `argocd/manifests/prowler/mutelist/trivyignore.yaml` |
| Container Images (Saturday) | **None — `cronjob-image-scan.yaml` does not pass `--mutelist-file`** | n/a |
The image scan has never been wired to a mutelist. The CSV reports do contain a `MUTED` column, but it is always `False` because no mutelist is supplied. All 14k+ image findings flow through to `review-compliance-reports` unfiltered.
The mute tag convention is consistent across the two configured scans: each entry's `Description:` (or `statement:` for trivyignore) starts with `CC: <id>. <freeform>`. `mise run review-compensating-controls` greps for those IDs to find every file that depends on each control. There is no NA tag, no RA tag, and no expiry field.
## Proposed model
### Tag prefixes
Extend the description-prefix convention:
- `CC: <control-id>. <description>` — references an entry in `compensating-controls.yaml`. Existing convention, unchanged.
- `NA: <reason>. <description>` — environmental precondition fails. Reason should be specific enough that a reviewer can verify it (e.g. `NA: no 32-bit runtimes`, not `NA: doesn't apply`).
- `RA: <reason>; expires <YYYY-MM-DD>. <description>` — explicit risk acceptance with a hard expiry. Past the expiry, re-review is mandatory.
Tag choice is exclusive: a given mute is one of CC, NA, or RA. If two reasons apply, pick the strongest — CC > RA > NA.
### Tooling changes required
1. **Wire the image scan to a mutelist.** Add `argocd/manifests/prowler/mutelist/image-cves.yaml`, mount-and-merge it the same way `cronjob.yaml` mounts its mutelist parts, and pass `--mutelist-file` to `prowler image`. Verify experimentally that `prowler image` honors the flag — Prowler's behavior across providers is inconsistent, and the IaC provider notably does not. If `prowler image` ignores it, fall back to post-scan filtering inside `review-compliance-reports`.
2. **Teach `review-compensating-controls` (or a sibling) to surface NA and RA entries.** CCs already get a staleness queue. NAs should appear in a separate queue keyed on the reason text — when an NA reason becomes false (e.g. we do introduce a 32-bit runtime), every NA mute citing that reason must be reopened. RAs should sort by expiry date, with anything past expiry flagged red.
3. **Expiry parsing.** RA tags carry a hard date. The simplest path is to parse it from the description string at review time. A more durable path is to extend the mutelist YAML schema with a structured `expires:` field and a small wrapper that strips it before passing the file to Prowler. Either works; the structured field is friendlier to editors.
### Out of scope (for now)
- Changing the underlying Prowler mutelist YAML schema. Stay within the `Mutelist:` shape Prowler expects.
- Migrating existing `CC:` entries. The current set is genuinely CCs and should stay tagged that way.
- Building an issue-tracker integration. Todoist is the source of truth for "remember to re-review this" until that scales painfully.
## Order of operations
When this work is picked up, the suggested sequence is:
1. **Scope and confirm.** Re-read this article, confirm the model still fits, adjust if not.
2. **Wire the image-scan mutelist.** Smallest atomic change; produces immediate value (the CVE-2026-31789 mute can land as the first NA entry).
3. **Add the NA convention.** Update [[read-compliance-reports]] and [[review-compensating-controls]] how-tos to describe the three tag prefixes. The convention can land before tooling supports it — review will just be manual until tooling catches up.
4. **Extend the review tools.** Add NA and RA queues to `review-compensating-controls` (or a new task). At this point, parse expiry from RA descriptions.
5. **Optionally: structured expiry.** If RA entries become common, migrate to a structured `expires:` YAML field with a wrapper that filters it out before Prowler reads the file.
The first three steps are a coherent C1. Steps 4–5 can be split off if scope creeps.
## Related
- [[read-compliance-reports]] — the weekly review process this feeds into
- [[review-compensating-controls]] — current CC review tooling
- [[security-model]] — overall security posture
- [[prowler]] — scanner reference
- [[agent-change-process]] — how to scope and execute the implementation

View file

@ -80,7 +80,7 @@ Not all failures require action. Common expected failures in our minikube cluste
1. **Triage** — review new failures, distinguish real issues from expected noise
2. **Remediate** — fix what you can (pod security contexts, RBAC tightening)
3. **Mutelist** — suppress expected/accepted failures via Prowler's `--mutelist-file` to reduce noise in future scans
3. **Mutelist** — suppress expected/accepted failures by adding a Resource entry under the matching Check in `argocd/manifests/prowler/mutelist/*.yaml` with a free-form `Description` explaining why
4. **Track** — compare reports over time to spot regressions
## Related

View file

@ -1,50 +0,0 @@
---
title: Record Review Evidence
modified: 2026-04-01
last-reviewed: 2026-04-01
tags:
- how-to
- security
- compliance
---
# Record Review Evidence
How review evidence *would* be captured after a [[review-compensating-controls|compensating control review]], to make the review auditable under a compliance framework.
blumeops does not currently collect review evidence. This card documents the target process for reference and practice.
## Why Record Evidence?
Reviewing a control and updating `last-reviewed` proves the review *happened* but not *what was checked*. Under frameworks like PCI DSS v4.0, a QSA needs to see dated, immutable evidence that the reviewer verified the control and that an appropriate party accepted the residual risk. Compliance platforms like Drata automate this collection, but the underlying artifacts are the same whether you use a platform or a directory of files.
## What Evidence Would Be Captured
For each control reviewed, artifacts should answer:
1. **Who reviewed it** — reviewer name, date
2. **What was verified** — the specific checks performed (e.g., Tailscale ACL policy snapshot, `tailscale status` output, kubectl auth checks)
3. **What was found** — the outcome: control still in effect, circumstances changed, or control invalidated
4. **Residual risk** — what the control does *not* cover (the gap a QSA will ask about)
5. **Acceptance** — formal sign-off that the residual risk is accepted by an appropriate party (reviewer + approver, typically a manager or CTO)
Supporting artifacts would include command output, policy snapshots, screenshots, or API responses — anything that demonstrates the verification was actually performed.
## PCI DSS Context
Under PCI DSS v4.0, compensating controls require a **Compensating Control Worksheet (CCW)** that maps each control to the original requirement it substitutes for. The CCW fields are:
- **Original requirement** — the specific PCI DSS requirement not directly met
- **Constraint** — why direct compliance isn't feasible
- **Compensating control definition** — what is done instead
- **Risk addressed** — how the control mitigates the original threat
- **Residual risk** — what remains unmitigated
- **Validation procedure** — steps to verify (what `notes` captures in `compensating-controls.yaml`)
Req 12.3.2 mandates review **at least annually** (quarterly is typical for Level 1 Service Providers). In a platform like Drata, these map to Controls with uploaded Evidence and review workflows requiring sign-off from both the reviewer and an approver.
## Related
- [[review-compensating-controls]] — The technical review process
- [[security]] — Security posture overview
- [[read-compliance-reports]] — Interpreting Prowler/Kingfisher reports

View file

@ -1,80 +0,0 @@
---
title: Review Compensating Controls
modified: 2026-03-30
last-reviewed: 2026-03-30
tags:
- how-to
- security
- maintenance
---
# Review Compensating Controls
How to periodically review compensating controls that justify suppressed security findings.
## Review by Staleness
Show controls sorted by when they were last reviewed (most stale first):
```bash
mise run review-compensating-controls
```
This reads `compensating-controls.yaml` (repo root), sorts by `last-reviewed`, and displays the most stale control with all codebase references. It also searches for every file that references the control ID, so you can see exactly which suppressed findings depend on it.
To show more entries:
```bash
mise run review-compensating-controls --limit 20
```
## What is a Compensating Control?
A compensating control is a security measure that mitigates the risk a finding was designed to detect, when the finding itself cannot be directly remediated. For example:
- **Finding:** API server does not enable AlwaysPullImages admission plugin
- **Risk:** Untrusted users could run pods using cached images they shouldn't have access to
- **Compensating control:** `single-user-cluster` — only the operator has kubectl access; no untrusted users can create pods
Controls are documented in `compensating-controls.yaml` and referenced from security tool configurations (Prowler mutelist files, Kingfisher config, etc.) using the format `CC: <control-id>`.
A compensating control is only one of three structurally distinct ways to suppress a finding — see [[compliance-mute-categories]] for when to reach for a CC versus a not-applicable (`NA:`) or risk-accepted (`RA:`) tag instead.
## Review Process
For each control up for review:
1. **Understand the risk.** Read each suppressed finding that references this control. What attack or misconfiguration does the original check guard against?
2. **Verify the control is in effect.** Follow the verification steps in the control's `notes` field. For example, for `tailscale-network-isolation`, check that the cluster is not directly internet-exposed and Tailscale ACLs are enforced.
3. **Assess whether the control actually mitigates the risk.** A compensating control should address the same threat the check was designed to catch, not just be a vaguely related security measure. If it doesn't hold up, either:
- Fix the underlying finding and remove the suppression
- Document a stronger or more specific compensating control
4. **Check for changed circumstances.** Has the cluster gained new users? Has a service been exposed publicly? Has an operator added native support for the missing feature? Any of these could invalidate the control.
5. **Update the review date.** Edit `compensating-controls.yaml` and set `last-reviewed` to today's date. Commit alongside any changes.
## Adding a New Control
When suppressing a new security finding, either map it to an existing control or add a new one:
```yaml
- id: my-new-control
description: >-
What this control does and how it mitigates the specific risk.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
How to verify this control is still in effect.
```
Then reference it in the suppression configuration with `CC: my-new-control`.
## Related
- [[record-review-evidence]] — Capturing evidence artifacts for audit (aspirational)
- [[security]] — Security posture overview
- [[read-compliance-reports]] — Accessing and interpreting Prowler reports
- [[review-services]] — Periodic service version review (similar staleness pattern)

View file

@ -46,13 +46,7 @@ Security posture and compliance scanning for BlumeOps infrastructure.
All compliance scan reports are stored on `sifaka:/volume1/reports/`. See [[read-compliance-reports]] for access and interpretation.
## Compensating controls
Suppressed findings reference named compensating controls tracked in `compensating-controls.yaml` (repo root). Each control has a review date and verification steps. See [[review-compensating-controls]] for the review process.
```bash
mise run review-compensating-controls
```
Suppressed findings are kept in Prowler mutelist YAML under `argocd/manifests/prowler/mutelist/`. Each entry's `Description` field explains why the finding is muted; entries are reviewed ad-hoc rather than on a scheduled cadence.
## Known gaps

View file

@ -1,229 +0,0 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"]
# ///
#MISE description="Review the most stale compensating control"
#USAGE flag "--limit <limit>" default="10" help="Number of controls to show in the table"
"""Review compensating controls by staleness.
Reads ``compensating-controls.yaml`` and sorts by ``last-reviewed``.
Shows a staleness table, then displays the most stale control with all
references found in the codebase.
After reviewing, update the control entry:
last-reviewed: YYYY-MM-DD
Usage: mise run review-compensating-controls [--limit 10]
"""
import subprocess
import sys
from datetime import date
from pathlib import Path
from typing import Annotated
import typer
import yaml
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
# Repository root: this script sits one directory below it.
REPO_ROOT = Path(__file__).parent.parent
# Registry of compensating controls, kept at the repository root.
CONTROLS_FILE = REPO_ROOT / "compensating-controls.yaml"
def load_controls(path: Path) -> list[dict]:
    """Load the compensating controls listed in a YAML registry file.

    Args:
        path: Location of the YAML file. Its top-level mapping is expected
            to hold the controls under the ``controls`` key.

    Returns:
        The list stored under ``controls``, or an empty list when the file
        is empty, the key is missing, or the key is present but null.
    """
    # safe_load returns None for an empty document, and a bare ``controls:``
    # key loads as None as well; treat both as "no controls" instead of
    # failing with AttributeError or handing None to callers that iterate.
    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    return data.get("controls") or []
def parse_date(raw) -> date | None:
if raw is None:
return None
if isinstance(raw, date):
return raw
try:
return date.fromisoformat(str(raw))
except ValueError:
return None
def find_references(control_id: str) -> list[str]:
    """Find every line under the repository root that mentions a control ID.

    Runs ripgrep over ``REPO_ROOT`` and returns its raw output lines.

    Args:
        control_id: Identifier to search for. It is matched as a literal
            string, not as a regular expression.

    Returns:
        ripgrep output lines in ``path:line:content`` form, excluding any
        line that mentions the controls file or this script. Returns an
        empty list when ``rg`` is not installed or the search takes longer
        than 10 seconds.
    """
    command = [
        "rg",
        "--no-heading",
        "-n",
        # Control IDs are plain identifiers: match them literally, and pass
        # them via -e so an ID starting with "-" is never read as a flag.
        "--fixed-strings",
        "-e",
        control_id,
        str(REPO_ROOT),
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        # Best effort: a missing or slow ripgrep yields "no references".
        return []

    # Exclude the controls file itself and this script
    excluded = ("compensating-controls.yaml", "review-compensating-controls")
    return [
        ln
        for ln in result.stdout.strip().splitlines()
        if not any(marker in ln for marker in excluded)
    ]
def main(
    limit: Annotated[
        int, typer.Option(help="Number of controls to show in the table")
    ] = 10,
) -> None:
    """Show the compensating-control review queue, most stale control first."""
    # Local import: the file's top-level imports do not include the markup
    # helper. It is used below to render repository text literally.
    from rich.markup import escape

    console = Console()
    today = date.today()

    if not CONTROLS_FILE.exists():
        console.print(
            f"[bold red]Controls file not found:[/bold red] {CONTROLS_FILE}"
        )
        raise typer.Exit(code=1)

    controls = load_controls(CONTROLS_FILE)

    # Pair every control with its parsed review date (None = never reviewed).
    entries: list[tuple[dict, date | None]] = [
        (ctrl, parse_date(ctrl.get("last-reviewed"))) for ctrl in controls
    ]

    # Sort: never-reviewed first, then oldest
    entries.sort(key=lambda e: (e[1] is not None, e[1] or date.min))

    never_reviewed = sum(1 for _, r in entries if r is None)

    # --- Summary panel ---
    console.print()
    console.print(
        Panel(
            f"[bold]{len(entries)}[/bold] compensating controls, "
            f"[bold red]{never_reviewed}[/bold red] never reviewed",
            title="[bold]Compensating Control Review Queue[/bold]",
            border_style="cyan",
        )
    )
    console.print()

    # --- Staleness table ---
    table = Table(show_header=True, header_style="bold")
    table.add_column("#", justify="right")
    table.add_column("Control ID")
    table.add_column("Last Reviewed", justify="right")
    table.add_column("Age (days)", justify="right")
    table.add_column("Refs", justify="right")

    for i, (ctrl, reviewed) in enumerate(entries[:limit], 1):
        control_id = ctrl["id"]
        refs = len(find_references(control_id))
        if reviewed is None:
            table.add_row(
                str(i),
                f"[red]{control_id}[/red]",
                "[red]never[/red]",
                "[red]—[/red]",
                str(refs),
            )
        else:
            age = (today - reviewed).days
            # Controls not reviewed for more than 90 days are highlighted.
            style = "yellow" if age > 90 else ""
            id_str = f"[{style}]{control_id}[/{style}]" if style else control_id
            date_str = f"[{style}]{reviewed}[/{style}]" if style else str(reviewed)
            age_str = f"[{style}]{age}[/{style}]" if style else str(age)
            table.add_row(str(i), id_str, date_str, age_str, str(refs))

    remaining = len(entries) - limit
    if remaining > 0:
        table.add_row("", f"[dim]… {remaining} more[/dim]", "", "", "")

    console.print(table)
    console.print()

    # --- Most stale control detail ---
    if not entries:
        console.print("[bold red]No controls found![/bold red]")
        raise typer.Exit(code=1)

    top_ctrl, top_reviewed = entries[0]
    control_id = top_ctrl["id"]
    refs = find_references(control_id)

    # A key that is present but null in the YAML loads as None, so ``or ""``
    # is needed before stripping; a ``.get`` default alone does not cover it.
    description = str(top_ctrl.get("description") or "").strip()
    notes = str(top_ctrl.get("notes") or "").strip()

    detail_lines = [
        f"[bold cyan]{control_id}[/bold cyan]",
        f"[dim]Last reviewed: {top_reviewed or 'never'}[/dim]",
        "",
        f"[bold]Description:[/bold] {description}",
    ]
    if notes:
        detail_lines.append(f"[bold]Notes:[/bold] {notes}")

    console.print(
        Panel(
            "\n".join(detail_lines),
            title="[bold]Up For Review[/bold]",
            border_style="green",
        )
    )
    console.print()

    # --- References ---
    if refs:
        ref_table = Table(
            show_header=True, header_style="bold", title="References in codebase"
        )
        ref_table.add_column("File", style="cyan")
        ref_table.add_column("Line")
        for ref in refs:
            # rg output: file:line:content
            parts = ref.split(":", 2)
            # Cell text comes straight from repository files. Escape it so
            # bracketed text is shown literally and a stray "[/x]" cannot
            # raise a markup error.
            if len(parts) >= 3:
                filepath = parts[0].replace(str(REPO_ROOT) + "/", "")
                line_no = parts[1]
                content = parts[2].strip()
                ref_table.add_row(escape(f"{filepath}:{line_no}"), escape(content))
            else:
                ref_table.add_row(escape(ref), "")
        console.print(ref_table)
    else:
        console.print(
            f"[yellow]No references to '{control_id}' found in the codebase.[/yellow]"
        )
    console.print()

    # --- Review checklist ---
    checklist = [
        "[bold]Verification:[/bold]\n",
        f"• {notes}\n" if notes else "",
        "\n[bold]Review each reference:[/bold]\n",
        "• For each muted finding referencing this control, confirm:\n",
        " 1. The risk the original check guards against\n",
        " 2. That this control actually mitigates that risk\n",
        " 3. That the control is still in effect (not degraded or bypassed)\n",
        "\n[bold]After review:[/bold]\n",
        f"• Update compensating-controls.yaml: [cyan]last-reviewed: {today}[/cyan]\n",
        "• If the control is no longer valid, either:\n",
        " - Fix the underlying finding and remove the mute, or\n",
        " - Document a new/updated compensating control\n",
        "• Commit the change",
    ]
    console.print(
        Panel(
            "".join(checklist),
            title="[bold yellow]Review Guidance[/bold yellow]",
            border_style="yellow",
        )
    )
if __name__ == "__main__":
typer.run(main)

View file

@ -143,7 +143,10 @@ def _kubectl(args: str, timeout: int = 15) -> subprocess.CompletedProcess:
def run_node_verification(console: Console) -> None:
"""Verify node-level conditions that Prowler reports as MANUAL.
Compensating control: node-config-automated-verification
Prowler runs inside a pod and can't evaluate kubelet file permissions,
kubelet config arguments, etcd CA separation, or cluster-admin RBAC
bindings. We SSH into the minikube node and check each condition here,
failing loudly if any deviates from expected values.
"""
checks: list[tuple[str, str, bool]] = [] # (name, detail, passed)
@ -278,7 +281,7 @@ def run_node_verification(console: Console) -> None:
table = Table(
show_header=True,
header_style="bold",
title="Node Verification (CC: node-config-automated-verification)",
title="Node Verification (out-of-band checks for MANUAL findings)",
)
table.add_column("Check")
table.add_column("Detail")
@ -528,8 +531,8 @@ def summarize_report(
Panel(
f"[bold yellow]{len(latest['unmuted'])} unmuted failure(s) "
f"need triage.[/bold yellow]\n\n"
"For each: remediate or mute "
"(add to mutelist + compensating control).",
"For each: remediate, or add a Resource entry to the "
"matching check in argocd/manifests/prowler/mutelist/.",
title=f"{label} Verdict",
border_style="yellow",
)
@ -653,7 +656,6 @@ def main(
)
# --- Node-level MANUAL check verification ---
# Compensating control: node-config-automated-verification
# These checks verify conditions Prowler reports as MANUAL because it
# runs inside a pod and cannot evaluate them directly.
run_node_verification(console)