mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
ef2eb9f8d2
- Add `SanitizePromptText` stripping ~24 invisible Unicode codepoints and collapsing excessive newlines - Apply at write and read paths for defense-in-depth - Frontend: warn in both prompt textareas when invisible characters detected - Explicit codepoint list (not blanket `unicode.Cf`) to avoid breaking flag emoji - 34 Go tests + idempotency meta-test, 11 TS unit tests, 4 Storybook stories > This PR was created with the help of Coder Agents, and was reviewed by my human.
163 lines
4.6 KiB
Go
163 lines
4.6 KiB
Go
package chatd
|
||
|
||
import (
|
||
"strings"
|
||
"unicode"
|
||
)
|
||
|
||
// SanitizePromptText strips invisible Unicode characters that could
|
||
// hide prompt-injection content from human reviewers, normalizes line
|
||
// endings, collapses excessive blank lines, and trims surrounding
|
||
// whitespace.
|
||
//
|
||
// The stripped codepoints are truly invisible and have no legitimate
|
||
// use in prompt text. An explicit codepoint list is used rather than
|
||
// blanket unicode.Cf stripping to avoid breaking subdivision flag
|
||
// emoji (🏴) and other legitimate format characters.
|
||
//
|
||
// Note: U+200D (ZWJ) is stripped even though it joins compound emoji
|
||
// (e.g. 👨👩👦 → 👨👩👦). This is an acceptable trade-off because
|
||
// system prompts are not emoji art, and ZWJ is actively exploited in
|
||
// zero-width steganography schemes as a delimiter character.
|
||
func SanitizePromptText(s string) string {
|
||
// 1. Normalize line endings: \r\n → \n, lone \r → \n.
|
||
s = strings.ReplaceAll(s, "\r\n", "\n")
|
||
s = strings.ReplaceAll(s, "\r", "\n")
|
||
|
||
// 2. Strip invisible characters rune-by-rune.
|
||
var b strings.Builder
|
||
b.Grow(len(s))
|
||
for _, r := range s {
|
||
if !isVisible(r) {
|
||
continue
|
||
}
|
||
_, _ = b.WriteRune(r)
|
||
}
|
||
s = b.String()
|
||
|
||
// 3. Collapse 3+ consecutive newlines down to 2 (one blank
|
||
// line between paragraphs). This runs after invisible-char
|
||
// stripping so that lines containing only stripped chars
|
||
// become empty and get collapsed.
|
||
s = collapseNewlines(s)
|
||
|
||
// 4. Final trim.
|
||
return strings.TrimSpace(s)
|
||
}
|
||
|
||
// isVisible reports whether r should be kept in prompt text. It
// returns false only for the explicitly listed invisible codepoints
// below and true for every other rune.
//
// U+200C (ZWNJ) is deliberately NOT in the list. It is required for
// correct rendering of Persian, Urdu, and Kurdish scripts where it
// controls cursive joining. Stripping ZWS (U+200B) and ZWJ (U+200D)
// already breaks zero-width steganography encodings regardless of
// whether ZWNJ survives.
func isVisible(r rune) bool {
	// Isolated invisible codepoints.
	switch r {
	case 0x00AD, // soft hyphen: invisible in most renderers, can hide content boundaries
		0x034F, // combining grapheme joiner: invisible, no legitimate prompt use
		0x061C, // Arabic letter mark: invisible bidi control
		0x180E, // Mongolian vowel separator: invisible spacing character
		0x200B, // zero-width space
		0x200D, // zero-width joiner: used in compound emoji, but exploited in steganography
		0x200E, // left-to-right mark
		0x200F, // right-to-left mark
		0xFEFF: // byte order mark / zero-width no-break space, common in Windows-edited files
		return false
	}

	// Contiguous blocks of invisible codepoints (bounds inclusive).
	within := func(lo, hi rune) bool { return lo <= r && r <= hi }
	hidden := within(0x202A, 0x202E) || // bidi embedding/override: LRE, RLE, PDF, LRO, RLO
		within(0x2060, 0x2064) || // word joiner and invisible operators
		within(0x2066, 0x2069) || // bidi isolates: LRI, RLI, FSI, PDI
		within(0x206A, 0x206F) || // deprecated format characters
		within(0xFFF9, 0xFFFB) // interlinear annotation anchor, separator, terminator

	return !hidden
}
|
||
|
||
// collapseNewlines strips trailing whitespace from every line of s and
// then limits each run of consecutive newlines to at most 2, so a
// single blank line between paragraphs survives while long stretches
// of vertical padding (scroll-padding attacks) do not. Because the
// trailing whitespace is removed first, whitespace-only lines count as
// empty and collapse together with truly empty ones. Leading
// whitespace on a line is preserved so indentation stays intact.
func collapseNewlines(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	// run counts the newlines seen since the last non-newline rune,
	// including the ones that were suppressed.
	run := 0
	for idx, line := range strings.Split(s, "\n") {
		// Every line after the first is preceded by exactly one
		// newline separator in the input.
		if idx > 0 {
			run++
			if run <= 2 {
				_ = out.WriteByte('\n')
			}
		}

		// Any surviving content on the line ends the current run.
		for _, r := range strings.TrimRightFunc(line, unicode.IsSpace) {
			run = 0
			_, _ = out.WriteRune(r)
		}
	}
	return out.String()
}
|