diff --git a/CLAUDE.md b/CLAUDE.md index f1d640e..60757ad 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -117,6 +117,17 @@ The goal is to eventually use only locally built containers in all cases, with full supply chain control via forge.ops.eblu.me repositories, mirroring source from upstream. +**After triggering a build** (manual dispatch or push to main), verify the +workflow succeeded before proceeding: + +```fish +mise run runner-logs # find the run number +mise run runner-logs <run_number> # see jobs in the run +mise run runner-logs <run_number> -j <job> # fetch logs on failure +``` + +This also works for other forge repos (`--repo eblume/hermes`). + ## Third-Party Projects Ask user to mirror on forge first, then clone to `~/code/3rd//`. diff --git a/docs/changelog.d/+runner-logs-rewrite.bugfix.md b/docs/changelog.d/+runner-logs-rewrite.bugfix.md new file mode 100644 index 0000000..7962ac4 --- /dev/null +++ b/docs/changelog.d/+runner-logs-rewrite.bugfix.md @@ -0,0 +1 @@ +Rewrite `mise run runner-logs` CLI: list runs by run number (not task ID), drill into jobs per run, fetch logs via Forgejo web API instead of SSH+filesystem. Fixes broken log retrieval caused by incorrect hex path calculation and stale data directory. Added `--repo` to query any forge repo (e.g. sporks) and `--limit`/`-n` to control listing size (0 for all). diff --git a/docs/how-to/deployment/build-container-image.md b/docs/how-to/deployment/build-container-image.md index ce746b0..a0e7d03 100644 --- a/docs/how-to/deployment/build-container-image.md +++ b/docs/how-to/deployment/build-container-image.md @@ -68,6 +68,14 @@ mise run container-build-and-release --ref Use `--dry-run` to preview without dispatching. +After dispatching, verify the workflow succeeded with `runner-logs`: + +```bash +mise run runner-logs # find the new run number +mise run runner-logs <run_number> # see jobs and their status +mise run runner-logs <run_number> -j <job> # fetch full logs (e.g. 
on failure) +``` + | Build file | Workflow | Runner | Registry tag | |------------|----------|--------|--------------| | `container.py` | `build-container.yaml` | `k8s` (indri) | `:vX.Y.Z-` | @@ -99,7 +107,7 @@ Container image tags include the git commit SHA they were built from (e.g. `v3.9 **The rule:** Production manifests must reference images built from a commit on main. After merging a PR that changed `containers//`: 1. The merge to main automatically triggers a rebuild (the `build-container.yaml` / `build-container-nix.yaml` workflows fire on pushes to `main` that touch `containers/**`) -2. Wait for the workflow to complete — check at `https://forge.eblu.me/eblume/blumeops/actions` +2. Wait for the workflow to complete — verify with `mise run runner-logs` (find the run, check status) 3. Find the new main-SHA tag: ```bash mise run container-list diff --git a/docs/reference/tools/mise-tasks.md b/docs/reference/tools/mise-tasks.md index 16bc10e..02b8859 100644 --- a/docs/reference/tools/mise-tasks.md +++ b/docs/reference/tools/mise-tasks.md @@ -57,7 +57,7 @@ Run `mise tasks --sort name` for the live list with descriptions. 
|------|-------------| | `branch-cleanup` | Delete merged branches (local and remote) | | `pr-comments` | List unresolved PR comments | -| `runner-logs` | View Forgejo Actions workflow logs | +| `runner-logs` | List Forgejo Actions runs and fetch job logs (supports `--repo`, `--limit`) | | `validate-workflows` | Validate workflow files against runner schema | | `mikado-branch-invariant-check` | Validate Mikado Branch Invariant on `mikado/*` branches | diff --git a/mise-tasks/runner-logs b/mise-tasks/runner-logs index ec51608..4db203d 100755 --- a/mise-tasks/runner-logs +++ b/mise-tasks/runner-logs @@ -3,22 +3,23 @@ # requires-python = ">=3.12" # dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] # /// -#MISE description="Get logs for a Forgejo Actions workflow run (indri or ringtail runner)" -#USAGE arg "<runner>" help="Runner filter: indri, ringtail, or all" -#USAGE arg "[run_id]" help="Run ID to fetch logs for (omit to list recent runs)" -"""Fetch Forgejo Actions workflow logs from indri's log storage. - -Both the indri k8s runner and ringtail nix-container-builder runner report -logs back to the Forgejo server on indri. This tool lists recent runs -(optionally filtered by runner) and fetches compressed logs by run ID. +#MISE description="List recent Forgejo Actions runs or fetch logs for a specific job" +#USAGE arg "[run_number]" help="Run number to show jobs for (omit to list recent runs)" +#USAGE flag "--job -j <job>" help="Job index (0-based) to fetch logs for" +#USAGE flag "--runner -r <runner>" help="Filter listing by runner: indri, ringtail, or all" +#USAGE flag "--repo <repo>" help="Forge repo (owner/name), default eblume/blumeops" +#USAGE flag "--limit -n <limit>" help="Max runs to display (0 for all)" +"""List recent Forgejo Actions runs and fetch job logs. 
Usage: - mise run runner-logs all # list recent runs from all runners - mise run runner-logs ringtail # list recent ringtail runs - mise run runner-logs all 337 # fetch logs for run 337 + mise run runner-logs # list recent runs (default 15) + mise run runner-logs -n 0 # list ALL runs + mise run runner-logs -r ringtail # list recent ringtail runs + mise run runner-logs --repo eblume/hermes # list runs for a different repo + mise run runner-logs 474 # show jobs in run 474 + mise run runner-logs 474 -j 1 # fetch logs for job 1 of run 474 """ -import subprocess import sys from typing import Annotated @@ -27,9 +28,8 @@ import typer from rich.console import Console from rich.table import Table -FORGE_API = "https://forge.eblu.me/api/v1" -REPO = "eblume/blumeops" -ACTIONS_LOG_DIR = "/opt/homebrew/var/forgejo/data/actions_log/eblume/blumeops" +FORGE_URL = "https://forge.ops.eblu.me" +FORGE_API = f"{FORGE_URL}/api/v1" # Workflows using the ringtail nix-container-builder runner; everything else # runs on the indri k8s runner. 
@@ -42,89 +42,170 @@ def runner_for_workflow(workflow_id: str) -> str: return "ringtail" if workflow_id in RINGTAIL_WORKFLOWS else "indri" -def list_runs(runner: str, console: Console) -> None: - resp = httpx.get( - f"{FORGE_API}/repos/{REPO}/actions/tasks", - timeout=15, - ) - resp.raise_for_status() - runs = resp.json().get("workflow_runs", []) +def fetch_tasks(repo: str) -> list[dict]: + """Fetch all tasks from the Forgejo API, paginating if needed.""" + tasks: list[dict] = [] + page = 1 + while True: + resp = httpx.get( + f"{FORGE_API}/repos/{repo}/actions/tasks", + params={"page": page, "limit": 50}, + timeout=15, + ) + resp.raise_for_status() + batch = resp.json().get("workflow_runs", []) + if not batch: + break + tasks.extend(batch) + page += 1 + return tasks - table = Table(title=f"Recent runs (filter: {runner})") - table.add_column("ID", style="cyan", no_wrap=True) + +def list_runs(runner: str, repo: str, limit: int, console: Console) -> None: + """List recent workflow runs, grouped by run number.""" + tasks = fetch_tasks(repo) + + # Group tasks by run_number + runs: dict[int, list[dict]] = {} + for t in tasks: + rn = t["run_number"] + runs.setdefault(rn, []).append(t) + + table = Table(title=f"Recent runs — {repo} (filter: {runner})") + table.add_column("Run #", style="cyan", no_wrap=True) table.add_column("Status") table.add_column("Runner") - table.add_column("Name") + table.add_column("Jobs") table.add_column("Title") + table.add_column("Event") - for run in runs[:20]: - host = runner_for_workflow(run.get("workflow_id", "")) + shown = 0 + for rn in sorted(runs, reverse=True): + if limit > 0 and shown >= limit: + break + + jobs = sorted(runs[rn], key=lambda x: x["id"]) + workflow_id = jobs[0].get("workflow_id", "") + host = runner_for_workflow(workflow_id) if runner != "all" and host != runner: continue - status = run.get("status", "") - style = "green" if status == "success" else "red" if status == "failure" else "yellow" + + # Aggregate status: worst 
status wins + statuses = [j.get("status", "") for j in jobs] + if "failure" in statuses: + status, style = "failure", "red" + elif "running" in statuses or "waiting" in statuses: + status, style = "running", "yellow" + elif all(s == "success" for s in statuses): + status, style = "success", "green" + else: + status, style = statuses[0], "yellow" + + job_names = ", ".join(j.get("name", "?")[:30] for j in jobs) + title = (jobs[0].get("display_title") or "")[:40] + event = jobs[0].get("event", "") + table.add_row( - str(run["id"]), + str(rn), f"[{style}]{status}[/{style}]", host, - (run.get("name") or "")[:40], - (run.get("display_title") or "")[:30], + job_names, + title, + event, + ) + shown += 1 + + console.print(table) + console.print("\n[dim]Use: mise run runner-logs to see jobs in a run[/dim]") + console.print("[dim] mise run runner-logs -j N to fetch logs for job N[/dim]") + + +def show_jobs(run_number: int, repo: str, console: Console) -> None: + """Show the jobs within a specific run.""" + tasks = fetch_tasks(repo) + + jobs = sorted( + [t for t in tasks if t["run_number"] == run_number], + key=lambda x: x["id"], + ) + if not jobs: + typer.echo(f"Error: No jobs found for run #{run_number}", err=True) + raise typer.Exit(1) + + table = Table(title=f"Jobs in run #{run_number} — {repo}") + table.add_column("Job #", style="cyan", no_wrap=True) + table.add_column("Status") + table.add_column("Name") + table.add_column("Created") + + for i, job in enumerate(jobs): + status = job.get("status", "") + style = "green" if status == "success" else "red" if status == "failure" else "yellow" + table.add_row( + str(i), + f"[{style}]{status}[/{style}]", + job.get("name", ""), + job.get("created_at", ""), ) console.print(table) + console.print(f"\n[dim]Use: mise run runner-logs {run_number} -j N to fetch logs for job N[/dim]") -def fetch_log(run_id: int) -> None: - hex_subdir = f"{run_id:02x}" - log_file = f"{ACTIONS_LOG_DIR}/{hex_subdir}/{run_id}.log.zst" - - # All logs live 
on indri (the Forgejo server) regardless of runner - result = subprocess.run( - ["ssh", "indri", f"test -f '{log_file}' && zstd -d -c '{log_file}'"], - capture_output=True, - text=True, - ) - - if result.returncode == 0: - sys.stdout.write(result.stdout) - else: - typer.echo(f"Error: Log file not found for run {run_id}", err=True) - typer.echo(f"Expected path: {log_file}", err=True) - typer.echo("", err=True) - typer.echo("Available logs:", err=True) - avail = subprocess.run( - [ - "ssh", - "indri", - f"find '{ACTIONS_LOG_DIR}' -name '*.log.zst' -exec basename {{}} .log.zst \\; | sort -n | tail -10", - ], - capture_output=True, - text=True, +def fetch_log(run_number: int, job_index: int, repo: str) -> None: + """Fetch logs for a specific job via the Forgejo web endpoint.""" + url = f"{FORGE_URL}/{repo}/actions/runs/{run_number}/jobs/{job_index}/attempt/1/logs" + resp = httpx.get(url, timeout=30, follow_redirects=True) + if resp.status_code == 404: + typer.echo( + f"Error: No logs found for run #{run_number} job {job_index}", + err=True, ) - typer.echo(avail.stdout, err=True) + typer.echo(f"URL: {url}", err=True) raise typer.Exit(1) + resp.raise_for_status() + sys.stdout.write(resp.text) @app.command() def main( + run_number: Annotated[ + int | None, + typer.Argument(help="Run number to show jobs for (omit to list recent runs)"), + ] = None, + job: Annotated[ + int | None, + typer.Option("--job", "-j", help="Job index (0-based) to fetch logs for"), + ] = None, runner: Annotated[ str, - typer.Argument(help="Runner filter: indri, ringtail, or all"), - ], - run_id: Annotated[ - int | None, - typer.Argument(help="Run ID to fetch logs for (omit to list recent runs)"), - ] = None, + typer.Option("--runner", "-r", help="Filter listing by runner: indri, ringtail, or all"), + ] = "all", + repo: Annotated[ + str, + typer.Option("--repo", help="Forge repo (owner/name)"), + ] = "eblume/blumeops", + limit: Annotated[ + int, + typer.Option("--limit", "-n", help="Max runs to 
display (0 for all)"), + ] = 15, ) -> None: - """Get logs for a Forgejo Actions workflow run.""" + """List recent Forgejo Actions runs or fetch logs for a specific job.""" if runner not in ("indri", "ringtail", "all"): typer.echo(f"Error: runner must be 'indri', 'ringtail', or 'all', got '{runner}'") raise typer.Exit(1) - if run_id is None: - list_runs(runner, Console()) + console = Console() + + if run_number is None: + if job is not None: + typer.echo("Error: --job requires a run number", err=True) + raise typer.Exit(1) + list_runs(runner, repo, limit, console) + elif job is None: + show_jobs(run_number, repo, console) else: - fetch_log(run_id) + fetch_log(run_number, job, repo) if __name__ == "__main__":