Add compensating controls framework and date-based report dirs #320

Merged
eblume merged 2 commits from compensating-controls into main 2026-03-30 17:44:12 -07:00
9 changed files with 487 additions and 44 deletions
Showing only changes of commit 4b85e8ca73 - Show all commits

Add compensating controls framework with review tooling

Introduce compensating-controls.yaml to track named controls that
justify suppressed security findings. Each control has a description,
verification notes, and last-reviewed date.

Update all Prowler mutelist descriptions to reference controls via
"CC: <id>" prefix instead of restating findings. Nine controls cover:
single-user-cluster, tailscale-network-isolation, local-registry,
sso-gated-admin-tools, operator-managed-pods, ephemeral-privileged-jobs,
trusted-ci-only, init-container-isolation, observability-stack-audit.

Add mise task (review-compensating-controls) that surfaces the most
stale control with all codebase references, and how-to doc
([[review-compensating-controls]]) explaining the review process.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Erich Blume 2026-03-30 17:35:48 -07:00

View file

@ -1,5 +1,4 @@
# Minikube apiserver — flags managed by static pod manifests.
# Compensating control: cluster not internet-exposed; access via Tailscale ACLs.
Mutelist:
Accounts:
"*":
@ -7,48 +6,48 @@ Mutelist:
"apiserver_always_pull_images_plugin":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube default; AlwaysPullImages not enabled."
Description: "CC: single-user-cluster, local-registry. Only the operator has cluster access; all images pulled from private zot registry."
"apiserver_audit_log_maxage_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube does not configure audit logging."
Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail."
"apiserver_audit_log_maxbackup_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube does not configure audit logging."
Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail."
"apiserver_audit_log_maxsize_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube does not configure audit logging."
Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail."
"apiserver_audit_log_path_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube does not configure audit logging."
Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail."
"apiserver_deny_service_external_ips":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube default; no external IPs in use."
Description: "CC: tailscale-network-isolation. No external IPs routable; cluster only reachable via tailnet."
"apiserver_disable_profiling":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube default; profiling endpoint not exposed."
Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet."
"apiserver_encryption_provider_config_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube does not configure etcd encryption at rest."
Description: "CC: tailscale-network-isolation, single-user-cluster. Etcd not network-exposed; only operator has node access."
"apiserver_kubelet_cert_auth":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube manages kubelet certificates automatically."
Description: "CC: tailscale-network-isolation. Kubelet API not exposed outside the node; minikube auto-generates certificates."
"apiserver_request_timeout_set":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube default; using K8s default timeout."
Description: "CC: tailscale-network-isolation. API server only reachable via tailnet; DoS risk limited to trusted clients."
"apiserver_service_account_lookup_true":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube default."
Description: "CC: single-user-cluster. Only operator manages service accounts; no revoked tokens in circulation."
"apiserver_strong_ciphers_only":
Regions: ["*"]
Resources: ["^kube-apiserver-minikube$"]
Description: "Minikube default TLS cipher suite."
Description: "CC: tailscale-network-isolation. API server traffic encrypted by WireGuard at the network layer."

View file

@ -1,5 +1,4 @@
# Minikube control-plane components — managed by static pod manifests.
# Compensating control: cluster not internet-exposed; access via Tailscale ACLs.
Mutelist:
Accounts:
"*":
@ -7,12 +6,12 @@ Mutelist:
"controllermanager_disable_profiling":
Regions: ["*"]
Resources: ["^kube-controller-manager-minikube$"]
Description: "Minikube default; profiling endpoint not exposed outside tailnet."
Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet."
"scheduler_profiling":
Regions: ["*"]
Resources: ["^kube-scheduler-minikube$"]
Description: "Minikube default; profiling endpoint not exposed outside tailnet."
Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet."
"kubelet_tls_cert_and_key":
Regions: ["*"]
Resources: ["^kubelet-config$"]
Description: "Minikube uses auto-generated kubelet certificates."
Description: "CC: tailscale-network-isolation, single-user-cluster. Kubelet API not exposed outside node; minikube auto-generates certificates."

View file

@ -7,7 +7,7 @@ Mutelist:
"core_minimize_hostNetwork_containers":
Regions: ["*"]
Resources:
# Minikube control plane — requires hostNetwork by design
# Minikube control plane
- "^etcd-minikube$"
- "^kube-apiserver-minikube$"
- "^kube-controller-manager-minikube$"
@ -17,8 +17,9 @@ Mutelist:
- "^kindnet-"
- "^storage-provisioner$"
Description: >-
Control-plane and networking pods require hostNetwork.
All managed by minikube.
CC: tailscale-network-isolation. Control-plane and networking
pods require hostNetwork by design. Host network itself is
only reachable via tailnet.
"core_minimize_privileged_containers":
Regions: ["*"]
Resources:
@ -27,12 +28,13 @@ Mutelist:
# Tailscale operator-managed proxies
- "^ts-"
- "^ingress-"
# Forgejo runner — Docker-in-Docker for CI builds
# Forgejo runner
- "^forgejo-runner-"
Description: >-
kube-proxy: iptables (minikube). ts-*/ingress-*: network
namespace manipulation (Tailscale operator). forgejo-runner:
Docker-in-Docker for CI.
CC: single-user-cluster, operator-managed-pods, trusted-ci-only.
kube-proxy: system pod, single-user cluster. ts-*/ingress-*:
Tailscale operator-managed. forgejo-runner: DinD limited to
trusted private forge repos.
"core_seccomp_profile_docker_default":
Regions: ["*"]
Resources:
@ -47,34 +49,38 @@ Mutelist:
- "^nameserver-"
- "^ingress-"
Description: >-
System pods (minikube) and Tailscale operator pods — seccomp
profiles set by upstream/operator, not user manifests.
CC: single-user-cluster, operator-managed-pods. System pods
managed by minikube and Tailscale operator; seccomp profiles
set by upstream. Single-user cluster limits exploit surface.
"core_minimize_hostPID_containers":
Regions: ["*"]
Resources:
- "^prowler-"
Description: >-
Prowler CIS scanner requires hostPID to check file
permissions on kubelet and etcd data directories.
CC: ephemeral-privileged-jobs. Prowler CIS scanner requires
hostPID for file permission checks. Runs as CronJob with
7-day TTL, not a persistent workload.
"core_minimize_root_containers_admission":
Regions: ["*"]
Resources:
- "^grafana-"
Description: >-
Grafana init-chown-data runs as root to fix PVC ownership.
Main containers run as UID 472. Standard pattern.
CC: init-container-isolation. Root limited to init-chown-data
container; all runtime containers run as UID 472 with caps
dropped.
"core_minimize_containers_added_capabilities":
Regions: ["*"]
Resources:
# Minikube system pods
- "^coredns-"
- "^kindnet-"
# Grafana init-chown-data (CHOWN capability)
# Grafana init-chown-data
- "^grafana-"
Description: >-
System pods: NET_BIND_SERVICE/NET_RAW required by function
(minikube). Grafana: CHOWN for PVC init; all other
containers drop ALL.
CC: single-user-cluster, init-container-isolation. System
pods: capabilities required by function (minikube-managed).
Grafana: CHOWN limited to init phase; runtime containers
drop ALL.
"core_minimize_containers_capabilities_assigned":
Regions: ["*"]
Resources:
@ -82,5 +88,5 @@ Mutelist:
- "^kindnet-"
- "^grafana-"
Description: >-
System pods (minikube) and Grafana init-chown-data.
See core_minimize_containers_added_capabilities.
CC: single-user-cluster, init-container-isolation. See
core_minimize_containers_added_capabilities.

View file

@ -10,12 +10,12 @@ Mutelist:
# Built-in Kubernetes roles
- "^cluster-admin$"
- "^system:"
# ArgoCD — requires broad access for deployment management;
# ArgoCD itself is SSO-gated via Authentik
# ArgoCD
- "^argocd-"
Description: >-
Built-in K8s roles and ArgoCD. ArgoCD access is SSO-gated
via Authentik.
CC: single-user-cluster, sso-gated-admin-tools. Built-in
K8s roles: only operator can bind them. ArgoCD: requires
broad access but is SSO-gated via Authentik OIDC.
"rbac_minimize_pod_creation_access":
Regions: ["*"]
Resources:
@ -26,12 +26,14 @@ Mutelist:
# CloudNativePG operator
- "^cnpg-manager$"
Description: >-
Built-in K8s roles required for workload controllers.
cnpg-manager: CloudNativePG operator manages PostgreSQL pods.
CC: single-user-cluster. Built-in K8s roles and CNPG
operator. Only the operator can assign these roles; no
untrusted users have cluster access.
"rbac_minimize_service_account_token_creation":
Regions: ["*"]
Resources:
- "^system:"
Description: >-
kube-controller-manager requires token creation for service
account management. Built-in role.
CC: single-user-cluster. kube-controller-manager requires
token creation for SA management. Only operator manages
service accounts.

122
compensating-controls.yaml Normal file
View file

@ -0,0 +1,122 @@
# Compensating Controls
#
# Documents controls that mitigate risks from suppressed or accepted security
# findings. Referenced by security tools (Prowler mutelist, Kingfisher config,
# etc.) via "CC: <id>" in finding descriptions or suppression notes.
#
# Used by `mise run review-compensating-controls` to surface stale controls.
#
# Fields:
# id - kebab-case unique identifier, referenced from tool configs
# description - what the control actually does to mitigate risk
# created - date (YYYY-MM-DD) the control was documented
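# last-reviewed - date (YYYY-MM-DD) of the most recent review, or null if never reviewed
# notes - optional context, typically how to verify the control is still in effect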
controls:
- id: single-user-cluster
description: >-
Only the cluster operator (eblume) has kubectl access. No untrusted
users can create pods, access cached images, or bind RBAC roles.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
Verify by checking kubeconfig distribution and Tailscale ACLs.
If additional users gain cluster access, re-evaluate all findings
muted under this control.
- id: tailscale-network-isolation
description: >-
Cluster is not internet-exposed. All access requires Tailscale
identity with ACL enforcement. Profiling endpoints, debug ports,
and control-plane APIs are unreachable from the public internet.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
Verify with 'tailscale serve status --json' on indri and review
Tailscale ACLs in pulumi/tailscale/. Only tag:flyio-target services
are publicly routable.
- id: local-registry
description: >-
All container images are pulled from private zot registry
(registry.ops.eblu.me). No shared external registry credentials
are cached on cluster nodes.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
Verify by checking image prefixes in kustomization.yaml files.
Upstream images (immich, ollama) are exceptions — track in
service-versions.yaml.
- id: sso-gated-admin-tools
description: >-
ArgoCD and Grafana require SSO authentication via Authentik OIDC.
Wildcard RBAC in ArgoCD is mitigated by requiring authenticated
identity before any API access.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
Verify Authentik provider config and that anonymous access is
disabled. Check ArgoCD --auth-token isn't leaked.
- id: operator-managed-pods
description: >-
Tailscale operator manages proxy pod specs (ts-*, ingress-*,
operator-*, nameserver-*). Pod security settings are set by the
operator, not user manifests. Operator is tracked in
service-versions.yaml and regularly updated.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
Verify operator version is current via 'mise run service-review'.
Check Tailscale changelog for security fixes. If operator adds
seccomp support, remove these mutes.
- id: ephemeral-privileged-jobs
description: >-
Prowler CIS scanner runs as a CronJob with 7-day TTL
auto-deletion, not as a persistent privileged workload. hostPID
exposure is time-bounded to scan duration (~20s).
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
Verify TTL is set in cronjob.yaml. Check that no persistent
pods run with hostPID.
- id: trusted-ci-only
description: >-
Forgejo runner only executes workflows from repos on the private
forge (forge.ops.eblu.me). No external or untrusted repos can
trigger privileged CI jobs.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
Verify runner registration is limited to the forge instance.
Check Forgejo runner config for repo allow-lists.
- id: init-container-isolation
description: >-
Root privileges and added capabilities (CHOWN) are limited to
init containers that run once at pod startup. All runtime
containers run as non-root (UID 472) with all capabilities
dropped.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
Verify by inspecting grafana deployment.yaml securityContext
for both init and runtime containers. If fsGroup alone can
handle PVC ownership, remove init-chown-data and this control.
- id: observability-stack-audit
description: >-
Alloy collects pod logs and ships them to Loki, providing an
audit trail for cluster activity. Compensates for missing
apiserver audit logging which minikube does not configure.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
Verify Alloy DaemonSet is running and Loki is receiving logs.
Note this is weaker than native apiserver audit logs — it
captures pod stdout/stderr, not API request-level auditing.
Consider enabling minikube audit logging if supported.

View file

@ -0,0 +1 @@
Add compensating controls framework: tracking file, review mise task, and how-to doc. Map all Prowler mutelist entries to named controls with CC: prefixes.

View file

@ -0,0 +1,77 @@
---
title: Review Compensating Controls
modified: 2026-03-30
last-reviewed: 2026-03-30
tags:
- how-to
- security
- maintenance
---
# Review Compensating Controls
How to periodically review compensating controls that justify suppressed security findings.
## Review by Staleness
Show controls sorted by when they were last reviewed (most stale first):
```bash
mise run review-compensating-controls
```
This reads `compensating-controls.yaml` (repo root), sorts the controls by `last-reviewed`, and prints a staleness table followed by the details of the most stale control. For that control it also lists every file that references its ID, so you can see exactly which suppressed findings depend on it.
To show more rows in the staleness table (the default is 10):
```bash
mise run review-compensating-controls --limit 20
```
## What is a Compensating Control?
A compensating control is a security measure that mitigates the risk a finding was designed to detect, when the finding itself cannot be directly remediated. For example:
- **Finding:** API server does not enable AlwaysPullImages admission plugin
- **Risk:** Untrusted users could run pods using cached images they shouldn't have access to
- **Compensating control:** `single-user-cluster` — only the operator has kubectl access; no untrusted users can create pods
Controls are documented in `compensating-controls.yaml` and referenced from security tool configurations (Prowler mutelist files, Kingfisher config, etc.) using the format `CC: <control-id>`.
## Review Process
For each control up for review:
1. **Understand the risk.** Read each suppressed finding that references this control. What attack or misconfiguration does the original check guard against?
2. **Verify the control is in effect.** Follow the verification steps in the control's `notes` field. For example, for `tailscale-network-isolation`, check that the cluster is not directly internet-exposed and Tailscale ACLs are enforced.
3. **Assess whether the control actually mitigates the risk.** A compensating control should address the same threat the check was designed to catch, not just be a vaguely related security measure. If it doesn't hold up, either:
- Fix the underlying finding and remove the suppression
- Document a stronger or more specific compensating control
4. **Check for changed circumstances.** Has the cluster gained new users? Has a service been exposed publicly? Has an operator added native support for the missing feature? Any of these could invalidate the control.
5. **Update the review date.** Edit `compensating-controls.yaml` and set `last-reviewed` to today's date. Commit alongside any changes.
## Adding a New Control
When suppressing a new security finding, either map it to an existing control or add a new one:
```yaml
- id: my-new-control
description: >-
What this control does and how it mitigates the specific risk.
created: 2026-03-30
last-reviewed: 2026-03-30
notes: >-
How to verify this control is still in effect.
```
Then reference it in the suppression configuration with `CC: my-new-control`.
## Related
- [[security]] — Security posture overview
- [[read-compliance-reports]] — Accessing and interpreting Prowler reports
- [[review-services]] — Periodic service version review (similar staleness pattern)

View file

@ -46,6 +46,14 @@ Security posture and compliance scanning for BlumeOps infrastructure.
All compliance scan reports are stored on `sifaka:/volume1/reports/`. See [[read-compliance-reports]] for access and interpretation.
## Compensating controls
Suppressed findings reference named compensating controls tracked in `compensating-controls.yaml` (repo root). Each control has a review date and verification steps. See [[review-compensating-controls]] for the review process.
```bash
mise run review-compensating-controls
```
## Known gaps
- No SOC 2 compliance mapping for Kubernetes (Prowler only maps SOC 2 for AWS/Azure/GCP)

View file

@ -0,0 +1,229 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"]
# ///
#MISE description="Review the most stale compensating control"
#USAGE flag "--limit <limit>" default="10" help="Number of controls to show in the table"
"""Review compensating controls by staleness.
Reads ``compensating-controls.yaml`` and sorts by ``last-reviewed``.
Shows a staleness table, then displays the most stale control with all
references found in the codebase.
After reviewing, update the control entry:
last-reviewed: YYYY-MM-DD
Usage: mise run review-compensating-controls [--limit 10]
"""
import subprocess
import sys
from datetime import date
from pathlib import Path
from typing import Annotated
import typer
import yaml
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
# The repository root is taken to be the grandparent of this script's path.
REPO_ROOT = Path(__file__).parent.parent
# Registry of compensating controls, kept directly under the repository root.
CONTROLS_FILE = REPO_ROOT / "compensating-controls.yaml"
def load_controls(path: Path) -> list[dict]:
    """Load the control entries from the compensating-controls YAML file.

    Args:
        path: Location of the YAML file to read.

    Returns:
        The mapping entries found under the top-level ``controls`` key. An
        empty list is returned when the document is empty, is not a mapping,
        or has a missing/null/non-list ``controls`` key.
    """
    # The file contains non-ASCII punctuation, so do not rely on the
    # platform default encoding.
    data = yaml.safe_load(path.read_text(encoding="utf-8"))

    # safe_load() returns None for an empty document and may return a scalar
    # or list for a malformed one; only a mapping can hold "controls".
    if not isinstance(data, dict):
        return []

    controls = data.get("controls")
    # "controls:" with no value parses as None rather than an empty list.
    if not isinstance(controls, list):
        return []

    # Callers use dict access on every entry, so drop anything that is not
    # a mapping instead of letting it fail later with an AttributeError.
    return [entry for entry in controls if isinstance(entry, dict)]
def parse_date(raw) -> date | None:
if raw is None:
return None
if isinstance(raw, date):
return raw
try:
return date.fromisoformat(str(raw))
except ValueError:
return None
def find_references(control_id: str) -> list[str]:
    """Find lines in the repository that mention a control ID, using ripgrep.

    Args:
        control_id: Identifier to search for. It is matched as a literal
            string, not as a regular expression.

    Returns:
        Raw ripgrep output lines (``path:line:content``) for each match,
        excluding matches in any file whose path contains
        ``compensating-controls.yaml`` or ``review-compensating-controls``
        (the controls file itself and this review task). The result is an
        empty list when the ID is empty, when ``rg`` is not installed, or
        when the search times out; this lookup is best-effort.
    """
    # An empty pattern would match every line in the repository.
    if not control_id:
        return []

    command = [
        "rg",
        "--no-heading",
        "-n",
        # Treat the ID as literal text rather than a regex.
        "--fixed-strings",
        # -e keeps an ID that starts with "-" from being parsed as a flag.
        "-e",
        control_id,
        str(REPO_ROOT),
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return []

    excluded_path_markers = (
        "compensating-controls.yaml",
        "review-compensating-controls",
    )
    # Apply the exclusion to the file path only (the text before the first
    # colon); testing the whole line would also drop legitimate references
    # whose content happens to mention one of these names.
    return [
        line
        for line in result.stdout.strip().splitlines()
        if not any(
            marker in line.partition(":")[0] for marker in excluded_path_markers
        )
    ]
def main(
    limit: Annotated[
        int, typer.Option(help="Number of controls to show in the table")
    ] = 10,
) -> None:
    """Show the compensating-control review queue and the most stale control.

    Prints a staleness table (never-reviewed controls first, then oldest
    review date), followed by the details, codebase references, and review
    guidance for the control at the top of the queue. Exits with status 1
    when the controls file is missing or contains no controls.
    """
    # Local import: text taken from the YAML file and from ripgrep output is
    # arbitrary and must not be interpreted as Rich markup.
    from rich.markup import escape

    console = Console()
    today = date.today()
    # A negative limit would make the slice below drop rows from the end.
    limit = max(limit, 0)

    if not CONTROLS_FILE.exists():
        console.print(
            "[bold red]Controls file not found:[/bold red] "
            f"{escape(str(CONTROLS_FILE))}"
        )
        raise typer.Exit(code=1)

    controls = load_controls(CONTROLS_FILE) or []

    # Pair every control with its parsed review date (None = never reviewed).
    entries: list[tuple[dict, date | None]] = [
        (ctrl, parse_date(ctrl.get("last-reviewed"))) for ctrl in controls
    ]

    # Sort: never-reviewed first, then oldest
    entries.sort(key=lambda e: (e[1] is not None, e[1] or date.min))

    never_reviewed = sum(1 for _, r in entries if r is None)

    # Each lookup shells out to ripgrep, so remember results per control ID;
    # the top control is needed both for the table and for the detail view.
    ref_cache: dict[str, list[str]] = {}

    def references_for(wanted_id: str) -> list[str]:
        if wanted_id not in ref_cache:
            ref_cache[wanted_id] = find_references(wanted_id)
        return ref_cache[wanted_id]

    # --- Summary panel ---
    console.print()
    console.print(
        Panel(
            f"[bold]{len(entries)}[/bold] compensating controls, "
            f"[bold red]{never_reviewed}[/bold red] never reviewed",
            title="[bold]Compensating Control Review Queue[/bold]",
            border_style="cyan",
        )
    )
    console.print()

    # --- Staleness table ---
    table = Table(show_header=True, header_style="bold")
    table.add_column("#", justify="right")
    table.add_column("Control ID")
    table.add_column("Last Reviewed", justify="right")
    table.add_column("Age (days)", justify="right")
    table.add_column("Refs", justify="right")

    for i, (ctrl, reviewed) in enumerate(entries[:limit], 1):
        control_id = str(ctrl["id"])
        shown_id = escape(control_id)
        ref_count = str(len(references_for(control_id)))

        if reviewed is None:
            table.add_row(
                str(i),
                f"[red]{shown_id}[/red]",
                "[red]never[/red]",
                "[red]—[/red]",
                ref_count,
            )
            continue

        age = (today - reviewed).days
        if age > 90:
            # Highlight controls that have gone more than 90 days unreviewed.
            table.add_row(
                str(i),
                f"[yellow]{shown_id}[/yellow]",
                f"[yellow]{reviewed}[/yellow]",
                f"[yellow]{age}[/yellow]",
                ref_count,
            )
        else:
            table.add_row(str(i), shown_id, str(reviewed), str(age), ref_count)

    remaining = len(entries) - limit
    if remaining > 0:
        table.add_row("", f"[dim]… {remaining} more[/dim]", "", "", "")

    console.print(table)
    console.print()

    # --- Most stale control detail ---
    if not entries:
        console.print("[bold red]No controls found![/bold red]")
        raise typer.Exit(code=1)

    top_ctrl, top_reviewed = entries[0]
    control_id = str(top_ctrl["id"])
    shown_id = escape(control_id)
    refs = references_for(control_id)

    # A key that is present but empty loads as None, so `.get(key, "")`
    # alone is not enough to guarantee a string here.
    description = escape(str(top_ctrl.get("description") or "").strip())
    notes = escape(str(top_ctrl.get("notes") or "").strip())

    detail_lines = [
        f"[bold cyan]{shown_id}[/bold cyan]",
        f"[dim]Last reviewed: {top_reviewed or 'never'}[/dim]",
        "",
        f"[bold]Description:[/bold] {description}",
    ]
    if notes:
        detail_lines.append(f"[bold]Notes:[/bold] {notes}")

    console.print(
        Panel(
            "\n".join(detail_lines),
            title="[bold]Up For Review[/bold]",
            border_style="green",
        )
    )
    console.print()

    # --- References ---
    if refs:
        ref_table = Table(
            show_header=True, header_style="bold", title="References in codebase"
        )
        ref_table.add_column("File", style="cyan")
        ref_table.add_column("Line")
        for ref in refs:
            # rg output: file:line:content
            parts = ref.split(":", 2)
            if len(parts) >= 3:
                # Show paths relative to the repository root.
                filepath = parts[0].removeprefix(str(REPO_ROOT) + "/")
                line_no = parts[1]
                content = parts[2].strip()
                ref_table.add_row(
                    escape(f"{filepath}:{line_no}"), escape(content)
                )
            else:
                ref_table.add_row(escape(ref), "")
        console.print(ref_table)
    else:
        console.print(
            f"[yellow]No references to '{shown_id}' found in the codebase.[/yellow]"
        )
    console.print()

    # --- Review checklist ---
    checklist = [
        "[bold]Verification:[/bold]\n",
        f"• {notes}\n" if notes else "",
        "\n[bold]Review each reference:[/bold]\n",
        "• For each muted finding referencing this control, confirm:\n",
        "  1. The risk the original check guards against\n",
        "  2. That this control actually mitigates that risk\n",
        "  3. That the control is still in effect (not degraded or bypassed)\n",
        "\n[bold]After review:[/bold]\n",
        f"• Update compensating-controls.yaml: [cyan]last-reviewed: {today}[/cyan]\n",
        "• If the control is no longer valid, either:\n",
        "  - Fix the underlying finding and remove the mute, or\n",
        "  - Document a new/updated compensating control\n",
        "• Commit the change",
    ]
    console.print(
        Panel(
            "".join(checklist),
            title="[bold yellow]Review Guidance[/bold yellow]",
            border_style="yellow",
        )
    )
# Script entry point: hand main() to Typer, which parses the command line
# (the --limit option) and invokes it.
if __name__ == "__main__":
    typer.run(main)