## Summary

Fixes the Facebook crawler spider trap that's been generating infinite recursive URLs like `/how-to/tutorials/tutorials/how-to/explanation/...` for several days.

**Root cause:** Quartz SPA mode + nginx `try_files` fallback to `index.html` meant any fabricated URL returned the root HTML shell with HTTP 200. Crawlers followed relative links from those fake URLs, creating infinite recursion.

**Fix:**

- Disable Quartz SPA mode (`enableSPA: false`) — all pages are now fully static HTML
- Replace nginx SPA fallback with `=404` + Quartz's static `404.html`
- Remove `robots.txt` exclusions (no longer needed)

**Docs cleanup (Obsidian.nvim compat no longer needed):**

- Delete hand-curated category index files (`tutorials.md`, `reference.md`, `how-to.md`, `explanation.md`) — Quartz auto-generates folder pages
- Delete `postgresql-storage.md` (redirect stub) and `migrate-forgejo-from-brew.md` (stale history)
- Drop `docs-check-index` and `docs-check-filenames` prek hooks
- Rewrite `docs-check-links` to allow path-based wiki-links (`[[path/to/file]]`) and only error on true ambiguity
- Add `ai-docs` doc tree listing to replace index files for AI context
- Add natural cross-links from reference cards to fix orphan docs

## Deployment and Testing

- [ ] Merge and let the build pipeline run
- [ ] Verify docs.eblu.me serves pages correctly with full page loads
- [ ] Verify non-existent URLs return 404
- [ ] Monitor crawler traffic — should drop to near zero for fabricated URLs

Reviewed-on: #290
236 lines
8.6 KiB
Text
Executable file
236 lines
8.6 KiB
Text
Executable file
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = ["rich>=13.0.0"]
# ///
#MISE description="Validate all wiki-links point to existing doc files"
"""Validate that all wiki-links in documentation point to existing files.

This script scans all markdown files in the docs/ directory (excluding
changelog.d/), extracts wiki-links, and verifies each link target resolves
to an existing file.

Wiki-link formats supported:
- [[filename]] - resolves by stem (errors if ambiguous)
- [[path/to/file]] - resolves by relative path from docs root
- [[target|Display Text]] - either form with display text
- [[target#Heading]] - with anchor fragment (file part validated)

Resolution mirrors Quartz's "shortest" markdownLinkResolution:
bare names resolve when unique; use paths to disambiguate duplicates.

Usage: mise run docs-check-links
"""
|
|
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from rich.console import Console
|
|
from rich.markup import escape
|
|
from rich.table import Table
|
|
|
|
# Root of the documentation tree: the "docs" directory that sits beside
# this script's own directory.
DOCS_DIR = Path(__file__).parent.parent.joinpath("docs")

# Matches [[Target]] and [[Target|Display]]. Group 1 is the raw target;
# group 2, when present, is the pipe together with the display text.
WIKILINK_PATTERN = re.compile(r"\[\[([^\]|]+)(\|[^\]]+)?\]\]")

# Matches an inline code span delimited by single backticks.
INLINE_CODE_PATTERN = re.compile(r"`[^`]+`")
|
|
|
|
|
|
def extract_wikilinks(file_path: Path) -> list[tuple[str, int, bool]]:
    """Extract all wiki-link targets from a markdown file with line numbers.

    Wiki-links inside inline code spans (backticks) are ignored because they
    are treated as examples rather than real links.

    Args:
        file_path: Markdown file to scan. It is decoded as UTF-8.

    Returns:
        A list of ``(target, line_num, has_spaces)`` tuples in document
        order. ``target`` is the link target with surrounding whitespace
        stripped (any ``#anchor`` part is kept), ``line_num`` is 1-based, and
        ``has_spaces`` is True when the raw target had leading or trailing
        whitespace or the pipe separator was followed by a space.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    content = file_path.read_text(encoding="utf-8")
    links: list[tuple[str, int, bool]] = []

    for line_num, line in enumerate(content.splitlines(), start=1):
        # Drop inline code first so example links inside backticks are skipped.
        line_without_code = INLINE_CODE_PATTERN.sub("", line)
        for match in WIKILINK_PATTERN.finditer(line_without_code):
            raw_target = match.group(1)
            target = raw_target.strip()
            pipe_part = match.group(2)  # "|Display" or None
            # A space before the pipe is trailing whitespace on the raw
            # target, so only the space *after* the pipe needs its own check.
            has_spaces = raw_target != target or (
                pipe_part is not None and pipe_part.startswith("| ")
            )
            links.append((target, line_num, has_spaces))

    return links
|
|
|
|
|
|
def main() -> int:
    """Validate every wiki-link under ``DOCS_DIR`` and print a report.

    Scans all ``*.md`` files below ``DOCS_DIR`` (skipping anything inside a
    ``changelog.d`` directory) and reports four kinds of problems:

    - links with stray whitespace around the target or after the pipe,
    - bare-name links whose stem matches more than one file,
    - links whose target matches no file,
    - orphan documents that no *other* document links to (files whose stem
      is ``index`` are exempt).

    Returns:
        ``0`` when no problems were found, ``1`` otherwise, suitable for use
        as a process exit code.
    """
    console = Console()

    # Enumerate the checked files once, in a stable order, so the target
    # index and the link scan below always see the same set of documents.
    doc_files: list[tuple[Path, Path]] = []
    for md_file in sorted(DOCS_DIR.rglob("*.md")):
        rel = md_file.relative_to(DOCS_DIR)
        # Test the docs-relative parts only: a "changelog.d" directory
        # somewhere above the docs root must not exclude every file.
        if "changelog.d" in rel.parts:
            continue
        doc_files.append((md_file, rel))

    # Lookup structures:
    # - path_targets: docs-relative paths without extension, always with "/"
    #   separators (e.g. "reference/services/alloy")
    # - stem_to_paths: filename stem -> all paths sharing that stem
    path_targets: set[str] = set()
    stem_to_paths: dict[str, list[str]] = {}
    for md_file, rel in doc_files:
        # as_posix() keeps "/" separators on every platform so the keys match
        # the way links are written in markdown.
        target_path = rel.with_suffix("").as_posix()
        path_targets.add(target_path)
        stem_to_paths.setdefault(md_file.stem, []).append(target_path)

    # Special case: files at the repo root that are copied into docs during
    # the build, and are therefore valid link targets.
    repo_root = DOCS_DIR.parent
    build_time_docs = ["CHANGELOG.md"]
    for filename in build_time_docs:
        stem = Path(filename).stem
        # Skip absent files, and files already present in docs/ (for example
        # after a local build): registering the same path twice would make
        # the bare name look ambiguous.
        if stem in path_targets or not (repo_root / filename).exists():
            continue
        path_targets.add(stem)
        stem_to_paths.setdefault(stem, []).append(stem)

    # Collected problems.
    broken_links: list[tuple[str, int, str]] = []
    ambiguous_links: list[tuple[str, int, str, list[str]]] = []
    spaced_links: list[tuple[str, int, str]] = []

    # Orphan tracking is done per resolved path, not per stem: with duplicate
    # stems, a link to "a/foo" must not make "b/foo" count as linked, and a
    # link from "a/foo" to "b/foo" is a real inbound link, not a self link.
    linked_paths: set[str] = set()

    for md_file, rel in doc_files:
        rel_path = str(rel)
        source_path = rel.with_suffix("").as_posix()

        for target, line_num, has_spaces in extract_wikilinks(md_file):
            if has_spaces:
                spaced_links.append((rel_path, line_num, target))
                continue

            # Only the file part is validated; the anchor fragment is ignored.
            file_target = target.partition("#")[0]
            if not file_target:
                # Pure in-page anchor like [[#Heading]] is always valid.
                continue

            resolved: str | None = None
            if "/" in file_target:
                # Path-based link: must match a known path exactly.
                if file_target in path_targets:
                    resolved = file_target
                else:
                    broken_links.append((rel_path, line_num, target))
            else:
                # Bare stem link: must exist and be unique.
                candidates = stem_to_paths.get(file_target)
                if candidates is None:
                    broken_links.append((rel_path, line_num, target))
                elif len(candidates) > 1:
                    ambiguous_links.append((rel_path, line_num, target, candidates))
                else:
                    resolved = candidates[0]

            # A document linking to itself does not rescue it from orphanhood.
            if resolved is not None and resolved != source_path:
                linked_paths.add(resolved)

    def new_table(*columns: str) -> Table:
        """Build a report table; the "Line" column is right-aligned."""
        table = Table(show_header=True, header_style="bold")
        for column in columns:
            table.add_column(column, justify="right" if column == "Line" else "left")
        return table

    # Print results
    console.print("[bold]Wiki-Link Validation[/bold]")
    console.print()
    console.print(f"Found {len(path_targets)} valid link targets in documentation.")
    console.print()

    has_errors = False

    if spaced_links:
        has_errors = True
        console.print("[bold red]Wiki-Links With Spaces Found[/bold red]")
        console.print("Wiki-links must not have spaces in the target or around the pipe.")
        # escape() is required: Rich would otherwise parse the inner
        # "[target|Display Text]" as a markup tag and drop it from the output.
        console.print(escape("Use [[target|Display Text]] not [[target | Display Text]]."))
        console.print()
        table = new_table("File", "Line", "Target")
        for file_path, line_num, target in spaced_links:
            table.add_row(file_path, str(line_num), escape(f"[[{target}]]"))
        console.print(table)
        console.print()

    if ambiguous_links:
        has_errors = True
        console.print("[bold red]Ambiguous Wiki-Links Found[/bold red]")
        console.print("These bare-name links match multiple files.")
        # Escaped for the same reason as above: "[path/to/file]" looks like
        # a Rich markup tag.
        console.print(escape("Use a path-based link to disambiguate: [[path/to/file]]"))
        console.print()
        table = new_table("File", "Line", "Target", "Possible Paths")
        for file_path, line_num, target, paths in ambiguous_links:
            table.add_row(file_path, str(line_num), escape(f"[[{target}]]"), "\n".join(paths))
        console.print(table)
        console.print()

    if broken_links:
        has_errors = True
        console.print("[bold red]Broken Wiki-Links Found[/bold red]")
        table = new_table("File", "Line", "Target")
        for file_path, line_num, target in broken_links:
            table.add_row(file_path, str(line_num), escape(f"[[{target}]]"))
        console.print(table)
        console.print()
        console.print("Each wiki-link target must match a filename stem or path in docs/.")
        console.print()

    # Orphan detection: docs not linked from any other doc. Files whose stem
    # is listed in orphan_exceptions are never reported.
    orphan_exceptions = {"index"}
    orphans = sorted(
        (path.rsplit("/", 1)[-1], path)
        for path in path_targets - linked_paths
        if path.rsplit("/", 1)[-1] not in orphan_exceptions
    )
    if orphans:
        has_errors = True
        console.print("[bold red]Orphan Documents Found[/bold red]")
        console.print("These docs are not linked from any other document.")
        console.print()
        table = new_table("File", "Stem")
        for stem, path in orphans:
            table.add_row(f"{path}.md", stem)
        console.print(table)
        console.print()

    if has_errors:
        return 1

    console.print("[bold green]All wiki-links are valid![/bold green]")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|