diff --git a/.github/workflows/lint-external-links.yml b/.github/workflows/lint-external-links.yml
new file mode 100644
index 00000000000000..5dfe3d75ebf11d
--- /dev/null
+++ b/.github/workflows/lint-external-links.yml
@@ -0,0 +1,97 @@
+name: Check External Links
+
+on:
+  # Run weekly on Sundays at 2 AM UTC
+  schedule:
+    - cron: '0 2 * * 0'
+
+  # Allow manual triggering
+  workflow_dispatch:
+
+  # Run on PRs targeting master; only changed Markdown files are checked
+  pull_request:
+    branches: [master]
+
+jobs:
+  # Job for PRs: check only changed files
+  check-pr:
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Get changed files
+        id: changed
+        run: |
+          FILES=$(git diff --name-only --diff-filter=AM origin/${{ github.base_ref }}...HEAD -- '*.md' '*.mdx' || true)
+          if [ -z "$FILES" ]; then
+            echo "files=" >> $GITHUB_OUTPUT
+            echo "No markdown files changed"
+          else
+            echo "files<<EOF" >> $GITHUB_OUTPUT
+            echo "$FILES" >> $GITHUB_OUTPUT
+            echo "EOF" >> $GITHUB_OUTPUT
+            echo "Changed files:"
+            echo "$FILES"
+          fi
+
+      - name: Restore lychee cache
+        if: steps.changed.outputs.files != ''
+        uses: actions/cache/restore@v4
+        with:
+          path: .lycheecache
+          key: lychee-cache-
+          restore-keys: lychee-cache-
+
+      - name: Check external links
+        if: steps.changed.outputs.files != ''
+        uses: lycheeverse/lychee-action@v2
+        with:
+          args: --verbose --no-progress ${{ steps.changed.outputs.files }}
+          fail: true
+          jobSummary: true
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  # Job for scheduled/manual runs: check all files, write lychee-report.md
+  check-full:
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write # NOTE(review): no step in this workflow opens an issue - confirm still needed
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Cache strategy: see lychee.toml for details
+      # - Restore previous cache so successful checks are skipped
+      # - Transient errors (429, 5xx) are excluded from cache and retried
+      # - Save updated cache for next run
+      - name: Restore lychee cache
+        uses: actions/cache/restore@v4
+        with:
+          path: .lycheecache
+          key: lychee-cache-
+          restore-keys: lychee-cache-
+
+      - name: Check external links
+        id: lychee
+        uses: lycheeverse/lychee-action@v2
+        with:
+          args: --verbose .
+          output: ./lychee-report.md
+          format: markdown
+          fail: true
+          jobSummary: true
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Save lychee cache
+        uses: actions/cache/save@v4
+        if: always()
+        with:
+          path: .lycheecache
+          key: lychee-cache-${{ github.run_id }}
diff --git a/.gitignore b/.gitignore
index d9228e6660908a..37df37dde887f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,6 @@ public/og-images/*
 yalc.lock
 /public/doctree.json
 /public/doctree-dev.json
+
+# Lychee cache
+.lycheecache
diff --git a/.lycheeignore b/.lycheeignore
new file mode 100644
index 00000000000000..6416d8fc381fab
--- /dev/null
+++ b/.lycheeignore
@@ -0,0 +1,58 @@
+# URLs to ignore during external link checking
+# Supports regex patterns - lines starting with # are comments
+# Note: Private IPs (localhost, 10.x, 172.16-31.x, 192.168.x) are handled by exclude_all_private in lychee.toml
+
+# Example/placeholder URLs
+https?://example\.com.*
+https?://your-.*
+https?://.*\.example\..*
+https?://___.*___.*
+
+# Internal Sentry development URLs
+https?://.*\.getsentry\.net.*
+https?://sentry-content-dashboard\.sentry\.dev.*
+
+# Sites known to block automated checkers
+https?://twitter\.com.*
+https?://x\.com.*
+https?://linkedin\.com.*
+https?://www\.linkedin\.com.*
+https?://www\.npmjs\.com.*
+https?://search\.maven\.org.*
+https?://medium\.com.*
+https?://.*\.medium\.com.*
+https?://gitlab\.com/oauth/.*
+https?://docs\.gitlab\.com.*
+https?://dev\.epicgames\.com.*
+https?://docs\.unrealengine\.com.*
+https?://cursor\.com.*
+https?://dash\.cloudflare\.com.*
+https?://www\.freedesktop\.org.*
+
+# TLS compatibility issues (sites work in browser but fail in lychee due to native-tls)
+# bottlepy.org only supports TLS 1.3, incompatible with lychee's TLS backend
+https?://bottlepy\.org.*
+
+# Cloudflare ECH (Encrypted Client Hello) required - fails even with curl/openssl
+https?://help\.revise\.dev.*
+https?://.*\.intercomhelpcenter\.com.*
+
+# Rate-limited sites (may fail intermittently with 429)
+https?://godoc\.org.*
+https?://pkg\.go\.dev.*
+
+# Interactive demos that may not respond to HEAD requests
+https?://demo\.arcade\.software.*
+
+# Private/internal resources
+https?://.*\.notion\.so.*
+https?://www\.notion\.so.*
+https?://github\.com/getsentry/getsentry.*
+https?://github\.com/getsentry/sentry-options-automator.*
+https?://github\.com/getsentry/etl.*
+https?://sentry\.zendesk\.com.*
+
+# Placeholder domains commonly used in docs
+https?://api\.example\.com.*
+https?://your-api-host.*
+https?://empowerplant\.io.*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b378cd172d02ec..214f4e5198a41e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,3 +34,11 @@ repos:
     rev: v1.39.0
     hooks:
       - id: typos
+  - repo: local
+    hooks:
+      - id: lychee
+        name: Check external links (warn only)
+        entry: bun scripts/lint-external-links.ts
+        language: system
+        files: \.(md|mdx)$
+        verbose: true
diff --git a/lychee.toml b/lychee.toml
new file mode 100644
index 00000000000000..e8691b8643e303
--- /dev/null
+++ b/lychee.toml
@@ -0,0 +1,58 @@
+# Lychee configuration for external link checking
+# Documentation: https://github.com/lycheeverse/lychee
+
+# Base URL to resolve root-relative links
+base_url = "https://docs.sentry.io"
+
+# Only check HTTP and HTTPS links
+scheme = ["https", "http"]
+
+# Exclude all private IP addresses automatically (localhost, 10.x, 172.16-31.x, 192.168.x, etc.)
+exclude_all_private = true
+
+# Exclude internal links (already handled by lint-404s script)
+exclude = ['^https://docs\.sentry\.io']
+
+# Maximum number of concurrent requests
+max_concurrency = 32
+
+# Maximum number of retries per request
+max_retries = 2
+
+# Request timeout in seconds
+timeout = 30
+
+# Retry wait time in seconds
+retry_wait_time = 2
+
+# User agent (some sites block default user agents)
+user_agent = "Mozilla/5.0 (compatible; Sentry-Docs-Link-Checker; +https://github.com/getsentry/sentry-docs)"
+
+# Accept common status codes that indicate the link works
+# Include 403 (possibly bot blocking) and 418 (freedesktop teapot) to reduce noise
+accept = [200, 201, 202, 203, 204, 206, 301, 302, 308, 403, 418]
+
+# Don't validate URL fragments/anchors (e.g., #section-name)
+# Fragment checking is unreliable: JS-rendered anchors appear broken, and many sites don't validate them
+include_fragments = false
+
+# Skip email (mailto:) addresses and links inside verbatim/code blocks
+include_mail = false
+include_verbatim = false
+
+# Follow redirects
+max_redirects = 10
+
+# Cache settings
+#
+# Strategy: Weekly scheduled runs populate the cache, PR checks consume it.
+# - Definitive responses (e.g. 200, 301, 403, 404) are cached and skipped on subsequent runs
+# - Transient errors (429 rate limits, 5xx server errors) are NOT cached, so they get retried
+# - Cache lifetime is just under 2 weeks so it survives between weekly runs
+#
+# This means each weekly run only re-checks:
+# 1. Links that failed with transient errors last time
+# 2. New links not yet in cache
+cache = true
+max_cache_age = "335h"
+cache_exclude_status = "429, 500.."
diff --git a/scripts/lint-404s/README.md b/scripts/lint-404s/README.md
index 6db24d2b8955df..ed637b535ad49f 100644
--- a/scripts/lint-404s/README.md
+++ b/scripts/lint-404s/README.md
@@ -63,3 +63,42 @@ The `ignore-list.txt` file contains paths that should be skipped during checking
 
 - `0` - No 404s found
 - `1` - 404s were detected
+
+## External Link Checking
+
+This script only checks **internal links**. External links (to third-party sites) are validated separately using [lychee](https://github.com/lycheeverse/lychee).
+
+### Running Locally
+
+```bash
+# Install lychee
+brew install lychee
+
+# Check all markdown files in the repo
+lychee .
+
+# Check a specific file
+lychee docs/platforms/javascript/index.mdx
+```
+
+### Pre-commit Hook
+
+A pre-commit hook checks external links in changed files (warn-only, won't block commits). If lychee is not installed locally, the hook prints an install hint and skips the check.
+
+### CI Workflow
+
+The GitHub workflow (`.github/workflows/lint-external-links.yml`) runs:
+
+- Weekly on a schedule (checks all files; broken links are reported in the job summary and a Markdown report)
+- On PRs (checks changed files only)
+- Manually via workflow dispatch
+
+### Configuration Files
+
+- `lychee.toml` - Lychee configuration
+- `.lycheeignore` - URLs to ignore during checking
+
+### Why Separate from Internal Link Checking?
+
+1. **False positives**: Many external sites block automated checkers
+2. **Different scope**: External checks only run on changed files in PRs; internal checks validate all pages
diff --git a/scripts/lint-external-links.ts b/scripts/lint-external-links.ts
new file mode 100644
index 00000000000000..baa947f3e4e1a2
--- /dev/null
+++ b/scripts/lint-external-links.ts
@@ -0,0 +1,40 @@
+/**
+ * Pre-commit hook wrapper for lychee external link checker.
+ * Runs lychee on provided files and warns on broken links without blocking commits.
+ *
+ * Usage: bun scripts/lint-external-links.ts [files...]
+ */
+
+import {spawnSync} from 'child_process';
+
+// lychee exits with 2 when links fail; other non-zero codes are tool errors.
+const LINK_FAILURE_EXIT_CODE = 2;
+
+// Nothing to check: leave before spawning any process.
+const files = process.argv.slice(2);
+if (files.length === 0) {
+  process.exit(0);
+}
+
+// Probe for lychee; a missing binary only warns and never blocks the commit.
+const versionCheck = spawnSync('lychee', ['--version'], {stdio: 'pipe'});
+if (versionCheck.error || versionCheck.status !== 0) {
+  console.log('Warning: lychee not installed. Skipping external link check.');
+  console.log(
+    'Install with: brew install lychee (macOS) or cargo install lychee (cross-platform)'
+  );
+  process.exit(0);
+}
+
+// Run lychee on the provided files; its report goes straight to the terminal.
+const {status} = spawnSync('lychee', ['--no-progress', ...files], {stdio: 'inherit'});
+if (status === LINK_FAILURE_EXIT_CODE) {
+  console.log('');
+  console.log('⚠️ External link issues found (commit not blocked)');
+} else if (status !== 0) {
+  // Config/runtime error, or status is null because lychee was killed or failed to spawn.
+  console.log(`\n⚠️ lychee did not complete (exit code: ${status}); commit not blocked`);
+}
+
+// Always exit 0 so commit proceeds
+process.exit(0);