Files
coder/scripts/check_emdash.sh
Mathias Fredriksson 3770176b7f fix(scripts): use merge-base in emdash lint to avoid false positives (#25726)
When GITHUB_BASE_REF is set, the emdash lint compared against the tip
of main instead of the merge-base. For PRs behind main, this produced
a diff covering all divergent files, flagging pre-existing emdashes the
PR never touched.

Query the PR commit count via gh, deepen HEAD by that amount, and
resolve HEAD~N as the merge-base. Falls back to the branch tip when
the merge-base cannot be determined.
2026-05-28 13:45:01 +03:00

204 lines
5.7 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail
# shellcheck source=scripts/lib.sh
source "$(dirname "${BASH_SOURCE[0]}")/lib.sh"
cdroot

echo "--- check for emdash/endash characters"

# Scan mode. "changed" is the default; passing --all anywhere on the
# command line switches to "all". Other arguments are ignored.
mode="changed"
for arg in "$@"; do
	case "$arg" in
	--all) mode="all" ;;
	esac
done

# The two characters are spelled as escaped UTF-8 bytes so that this
# script never contains them literally. Otherwise the check would flag
# the script itself whenever it shows up in a diff.
emdash=$'\xE2\x80\x94'
endash=$'\xE2\x80\x93'
# Extended-regex alternation matching either character.
printf -v pattern '%s|%s' "$emdash" "$endash"

# Git pathspecs for files that are exempt from the check. The same list
# is passed to both `git ls-files` and `git diff`.
exclude_pathspecs=(
	":(exclude)aibridge/fixtures/**/*.txtar"
	# Generated CLI golden files embed serpent's emdash-bordered footer.
	":(exclude)cli/testdata/*.golden"
	":(exclude)enterprise/cli/testdata/*.golden"
)
# scan_all_files greps every tracked file (minus exclude_pathspecs) for
# the emdash/endash pattern and prints matches as file:line:text.
#
# Globals:
#   pattern            (read)  extended regex to search for
#   exclude_pathspecs  (read)  git pathspecs to leave out
#   found              (write) set to 1 if anything matched, else 0
scan_all_files() {
	local output
	# -I skips binary files. -H forces the file name prefix even when
	# xargs hands grep a single file (a one-file repo, or a final batch
	# of one), which would otherwise print bare "line:text" matches.
	# -e/-- keep a pattern or path starting with "-" from being parsed
	# as an option. grep exits non-zero when nothing matches, so the
	# pipeline status is deliberately ignored.
	output=$(git ls-files -z -- "${exclude_pathspecs[@]}" |
		xargs -0 grep -IEnH -e "$pattern" -- 2>/dev/null || true)
	if [[ -n "$output" ]]; then
		printf '%s\n' "$output"
		found=1
	else
		found=0
	fi
}
# resolve_merge_base prints the merge-base between HEAD and the given
# ref on stdout. When it cannot be determined nothing is printed; the
# function still returns success, so callers must test for empty output.
#
# Arguments:
#   $1 - ref to compare HEAD against
#
# In shallow CI clones the merge-base is not directly reachable, so we
# query the PR commit count via `gh`, deepen HEAD by count+1, and retry.
# If that still fails, HEAD~count (the parent of the first PR commit on
# a linear branch) is used as a last resort.
resolve_merge_base() {
	local base_ref="$1"
	# Fast path: merge-base already reachable (full clone or sufficient depth).
	local mb
	mb=$(git merge-base HEAD "$base_ref" 2>/dev/null || true)
	if [[ -n "$mb" ]]; then
		echo "$mb"
		return
	fi
	if ! command -v gh >/dev/null 2>&1; then
		echo "gh CLI not found, cannot determine PR commit count." >&2
		return
	fi
	# Use the PR commit count to deepen HEAD past the PR commits.
	local count
	count=$(gh pr view --json commits --jq '.commits | length' 2>/dev/null || true)
	# Accept only a plain positive decimal integer. Anything else (empty,
	# "1.5", stray text) would raise an arithmetic error below instead of
	# taking the documented fallback. The 10# prefix keeps a value with
	# leading zeros from being parsed as octal.
	if [[ ! "$count" =~ ^[0-9]+$ ]] || ((10#$count <= 0)); then
		echo "Could not determine PR commit count from gh." >&2
		return
	fi
	count=$((10#$count))
	local depth=$((count + 1))
	echo "Deepening HEAD by ${depth} to reach PR base..." >&2
	# Best effort: a failed fetch just leaves the retry below empty-handed.
	git fetch --deepen="${depth}" 2>/dev/null || true
	# Retry merge-base now that we have more history.
	mb=$(git merge-base HEAD "$base_ref" 2>/dev/null || true)
	if [[ -n "$mb" ]]; then
		echo "$mb"
		return
	fi
	# Last resort: walk first-parent history. This is correct for
	# linear PRs but may traverse the wrong branch for merge-commit
	# checkouts.
	git rev-parse --verify "HEAD~${count}" 2>/dev/null || true
}
# fetch_base_ref makes sure the given remote-tracking ref resolves
# locally, fetching it from origin with depth 1 when it does not.
# CI shallow clones (fetch-depth: 1) typically omit the base branch.
#
# Arguments:
#   $1 - ref to check, e.g. origin/main
# Returns:
#   0 if the ref resolves (before or after fetching), 1 otherwise.
fetch_base_ref() {
	local target="$1"
	# Nothing to do when the ref is already known.
	git rev-parse --verify "$target" >/dev/null 2>&1 && return 0

	# The branch name on the remote is the ref without its origin/ prefix.
	local branch="${target#origin/}"
	echo "Base ref $target not found locally, fetching $branch..." >&2
	# A fetch failure is not fatal by itself; the re-check below decides.
	git fetch origin "$branch" --depth=1 2>/dev/null || true

	if git rev-parse --verify "$target" >/dev/null 2>&1; then
		return 0
	fi
	echo "ERROR: could not fetch base ref $target." >&2
	return 1
}
# resolve_diff_base prints the revision that changed lines should be
# diffed against.
#
# With GITHUB_BASE_REF set (CI pull requests) the result is the
# merge-base with origin/$GITHUB_BASE_REF, or that branch tip when no
# merge-base can be found. Without it (local dev) the result is the
# merge-base with origin/main, or the literal "origin/main" if
# merge-base fails. Nothing is printed when origin/main does not exist.
#
# Returns:
#   1 if the CI base ref could not be fetched, 0 otherwise.
resolve_diff_base() {
	# Local dev: no CI target branch configured.
	if [[ -z "${GITHUB_BASE_REF:-}" ]]; then
		if ! git rev-parse --verify origin/main >/dev/null 2>&1; then
			return 0
		fi
		git merge-base HEAD origin/main 2>/dev/null || echo "origin/main"
		return
	fi

	# CI pull requests: use merge-base against the target branch.
	local target="origin/${GITHUB_BASE_REF}"
	fetch_base_ref "$target" || return 1

	local resolved
	resolved=$(resolve_merge_base "$target")
	if [[ -z "$resolved" ]]; then
		# Could not determine merge-base; fall back to branch tip.
		echo "WARNING: could not find merge-base with $target, using branch tip (diff may include non-PR changes)." >&2
		resolved="$target"
	fi
	echo "$resolved"
}
# scan_diff checks only the added lines of `git diff <base>` for
# emdash/endash and prints each hit as file:line:text.
#
# Arguments:
#   $1 - revision to diff the working tree against
# Globals:
#   pattern            (read)  extended regex to search for
#   exclude_pathspecs  (read)  git pathspecs to leave out
#   found              (write) set to 1 on the first match, never reset
# Exits the script with status 1 if git diff fails, and with status 0
# (after printing an OK message) if the diff is empty.
scan_diff() {
	local base="$1"
	local diff_output
	if ! diff_output=$(git diff "$base" -U0 -- . "${exclude_pathspecs[@]}" 2>&1); then
		echo "ERROR: git diff against $base failed:" >&2
		echo "$diff_output" >&2
		exit 1
	fi
	if [[ -z "$diff_output" ]]; then
		echo "OK: no changes to check."
		exit 0
	fi
	# in_hunk separates a file's header lines from its hunk content.
	# Without it an added line whose text starts with "++ b/" or "++ /"
	# looks exactly like a "+++ " header, so it would be skipped (and
	# could overwrite current_file), letting a match slip through.
	local current_file="" current_line=0 in_hunk=0 diff_line
	while IFS= read -r diff_line; do
		# Start of the next file's header. Hunk content is always
		# prefixed with "+", "-" or "\", so this cannot be content.
		if [[ "$diff_line" == "diff --"* ]]; then
			in_hunk=0
			continue
		fi
		# Anchored to hunk header structure to avoid matching
		# digits from trailing function context.
		if [[ "$diff_line" =~ ^@@\ -[0-9,]+\ \+([0-9]+) ]]; then
			current_line=${BASH_REMATCH[1]}
			in_hunk=1
			continue
		fi
		if ((in_hunk == 0)); then
			# Header section: only the new-side path is of interest.
			if [[ "$diff_line" =~ ^\+\+\+\ b/(.*) ]]; then
				current_file="${BASH_REMATCH[1]}"
			fi
			continue
		fi
		if [[ "$diff_line" == +* ]]; then
			# Match in-process rather than via `echo | grep -q`: that
			# pipeline forks per line and, under pipefail, can report
			# "no match" when echo is killed by SIGPIPE on a long line.
			if [[ "$diff_line" =~ $pattern ]]; then
				printf '%s\n' "${current_file}:${current_line}:${diff_line:1}"
				found=1
			fi
			current_line=$((current_line + 1))
		fi
	done <<<"$diff_output"
}
# Run the selected scan. Every path below leaves `found` at 1 when an
# offending character was seen and 0 otherwise.
found=0
case "$mode" in
all)
	scan_all_files
	;;
*)
	if ! base=$(resolve_diff_base); then
		echo "ERROR: could not determine base ref." >&2
		exit 1
	fi
	if [[ -n "$base" ]]; then
		scan_diff "$base"
	else
		# No base to diff against, so check the whole tree instead.
		echo "WARNING: no base ref found, scanning all tracked files." >&2
		scan_all_files
	fi
	;;
esac

# Report the outcome; any hit fails the check.
if [[ "$found" -ne 0 ]]; then
	printf '%s\n' \
		"" \
		"ERROR: Found emdash (U+2014) or endash (U+2013) characters." \
		"" \
		" Do not use emdash or endash in code, comments, string literals," \
		" or documentation. Use commas, semicolons, or periods instead." \
		" Restructure the sentence if needed. Do not replace them with" \
		" ' -- ' either." \
		"" \
		" Example:" \
		" Bad: This is slow [emdash] we should cache it." \
		" Good: This is slow. We should cache it." \
		" Good: This is slow, so we should cache it." \
		""
	exit 1
fi
echo "OK: no emdash or endash characters found."