coder/site/bin.go

package site

import (
	"archive/tar"
	"bytes"
	"crypto/sha1" // nolint: gosec // not used for cryptography
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"net/http"
	"os"
	"path"
	"path/filepath"
	"slices"
	"strings"
	"sync"

	"github.com/andybalholm/brotli"
	"github.com/klauspost/compress/zstd"
	"golang.org/x/sync/errgroup"
	"golang.org/x/sync/singleflight"
	"golang.org/x/xerrors"

	"github.com/coder/coder/v2/coderd/cachecompress"
)

// CompressionLevel is the compression level handed to
// cachecompress.NewCompressor when newBinHandler builds the compressing
// handler for cached binaries.
const CompressionLevel = 5

// errHashMismatch is a sentinel error used in verifyBinSha1IsCurrent. The
// per-file verification goroutines return it when an on-disk binary is
// missing or its SHA1 differs from the expected one; the caller translates it
// into a (false, nil) result rather than surfacing it as a failure.
var errHashMismatch = xerrors.New("hash mismatch")

// binHandler is the http.Handler for the /bin/ routes. It decorates responses
// with metadata headers (ETag, X-Original-Content-Length) before delegating
// the actual file serving to the wrapped handler.
type binHandler struct {
	// metadataCache supplies the size and SHA1 hash for a requested binary.
	metadataCache *binMetadataCache
	// handler performs the actual file serving. newBinHandler sets it to
	// either a caching compressor or a plain http.FileServer.
	handler       http.Handler
}

// StandardEncoders maps a content-encoding name to a constructor that wraps a
// writer in the matching compressor at the requested level. newBinHandler
// registers every entry on the caching compressor.
var StandardEncoders = map[string]func(w io.Writer, level int) io.WriteCloser{
	// Brotli.
	"br": func(dst io.Writer, lvl int) io.WriteCloser {
		return brotli.NewWriterLevel(dst, lvl)
	},
	// Zstandard. The numeric level is translated to the library's own
	// encoder level; a construction failure is treated as a programmer
	// error and panics.
	"zstd": func(dst io.Writer, lvl int) io.WriteCloser {
		encoderLevel := zstd.EncoderLevelFromZstd(lvl)
		enc, err := zstd.NewWriter(dst, zstd.WithEncoderLevel(encoderLevel))
		if err == nil {
			return enc
		}
		panic("invalid zstd compressor: " + err.Error())
	},
}

// ServeHTTP serves files below /bin/. Requests for any other path get a 404.
// For a file in the root of the bin filesystem it sets the ETag and
// X-Original-Content-Length headers from cached metadata and then delegates
// to the wrapped handler. Note that r.URL.Path is rewritten in place.
func (h *binHandler) ServeHTTP(rw http.ResponseWriter, r *http.Request) {
	if !strings.HasPrefix(r.URL.Path, "/bin/") {
		rw.WriteHeader(http.StatusNotFound)
		_, _ = rw.Write([]byte("not found"))
		return
	}

	// Drop the /bin prefix and convert underscores in the filename to
	// hyphens. We eventually want to change our hyphen-based filenames to
	// underscores, but we need to support both for now.
	trimmed := strings.TrimPrefix(r.URL.Path, "/bin")
	r.URL.Path = strings.ReplaceAll(trimmed, "_", "-")

	name := filePath(r.URL.Path)
	switch {
	case name == "" || name == "/":
		// Directory listings are served intentionally. This file system
		// should not contain anything sensitive.
		h.handler.ServeHTTP(rw, r)
		return
	case strings.Contains(name, "/"):
		// Only files in the root of this directory are served, so avoid any
		// shenanigans by rejecting names that still contain a slash.
		http.NotFound(rw, r)
		return
	}

	metadata, err := h.metadataCache.getMetadata(name)
	switch {
	case xerrors.Is(err, os.ErrNotExist):
		http.NotFound(rw, r)
		return
	case err != nil:
		http.Error(rw, err.Error(), http.StatusInternalServerError)
		return
	}

	header := rw.Header()

	// http.FileServer will not set Content-Length when performing chunked
	// transport encoding, which is used for large files like our binaries
	// so stream compression can be used.
	//
	// Clients like IDE extensions and the desktop apps can compare the value
	// of this header with the number of bytes written to disk after
	// decompression to show progress. Without it, they cannot show progress
	// unless compression is disabled.
	//
	// There isn't really a spec for a length header for the "inner" content
	// size, but some nginx modules use this header.
	header.Set("X-Original-Content-Length", fmt.Sprintf("%d", metadata.sizeBytes))

	// The ETag is the SHA1 hash of the file contents and must be quoted.
	header.Set("ETag", fmt.Sprintf(`%q`, metadata.sha1Hash))

	// http.FileServer will see the ETag header and automatically handle
	// If-Match and If-None-Match headers on the request properly.
	h.handler.ServeHTTP(rw, r)
}

// newBinHandler builds the handler for the /bin/ routes. When a cache
// directory is configured it is split into "compressed" (compressed copies)
// and "orig" (the original uncompressed binaries), and files are served
// through a caching compressor. Without a cache directory the binaries are
// served by a plain http.FileServer.
func newBinHandler(options *Options) (*binHandler, error) {
	var (
		origDir       = options.CacheDir
		compressedDir string
	)
	if options.CacheDir != "" {
		compressedDir = filepath.Join(options.CacheDir, "compressed")
		if err := os.MkdirAll(compressedDir, 0o700); err != nil {
			// A cache dir was provided, but we can't write to it.
			return nil, xerrors.Errorf("failed to create compressed directory in cache dir: %w", err)
		}
		origDir = filepath.Join(options.CacheDir, "orig")
		if err := os.MkdirAll(origDir, 0o700); err != nil {
			return nil, xerrors.Errorf("failed to create orig directory in cache dir: %w", err)
		}
	}

	// ExtractOrReadBinFS copes with an empty directory argument; this often
	// arises in testing.
	binFS, binHashes, err := ExtractOrReadBinFS(origDir, options.SiteFS)
	if err != nil {
		return nil, xerrors.Errorf("extract or read bin filesystem: %w", err)
	}

	var fileHandler http.Handler
	if compressedDir == "" {
		fileHandler = http.FileServer(binFS)
	} else {
		compressor := cachecompress.NewCompressor(options.Logger, CompressionLevel, compressedDir, binFS)
		for encoding, newEncoder := range StandardEncoders {
			compressor.SetEncoder(encoding, newEncoder)
		}
		fileHandler = compressor
	}

	return &binHandler{
		metadataCache: newBinMetadataCache(binFS, binHashes),
		handler:       fileHandler,
	}, nil
}

// ExtractOrReadBinFS locates the coder binaries to serve. Sources are tried
// in this order:
//
//  1. If dest is empty, the embedded bin directory of siteFS is used as-is.
//  2. If siteFS contains bin/coder.tar.zst, it is extracted into dest/bin
//     (unless the files already there match the expected hashes) and
//     dest/bin is served.
//  3. If siteFS has a bin directory holding anything other than GITKEEP,
//     that directory is served.
//  4. Otherwise dest/bin is served, allowing binaries to be manually placed
//     in dest (usually ${CODER_CACHE_DIRECTORY}/site/orig/bin).
//
// Returns a http.FileSystem that serves unpacked binaries, and a map of
// binary name to SHA1 hash. The returned hash map may be nil, incomplete, or
// contain hashes for missing files.
func ExtractOrReadBinFS(dest string, siteFS fs.FS) (http.FileSystem, map[string]string, error) {
	if dest == "" {
		// No destination on disk, the embedded fs is the only option.
		embedded, err := fs.Sub(siteFS, "bin")
		if err != nil {
			return nil, nil, xerrors.Errorf("cache path is empty and embedded fs does not have /bin: %w", err)
		}
		return http.FS(embedded), nil, nil
	}

	dest = filepath.Join(dest, "bin")
	// mkdest makes sure dest exists and exposes it as a file system.
	mkdest := func() (http.FileSystem, error) {
		if err := os.MkdirAll(dest, 0o700); err != nil {
			return nil, xerrors.Errorf("mkdir failed: %w", err)
		}
		return http.Dir(dest), nil
	}
	// serveDest serves dest without extracting anything and without any
	// known hashes.
	serveDest := func() (http.FileSystem, map[string]string, error) {
		destFS, err := mkdest()
		if err != nil {
			return nil, nil, xerrors.Errorf("mkdest failed: %w", err)
		}
		return destFS, map[string]string{}, nil
	}

	archive, err := siteFS.Open("bin/coder.tar.zst")
	if err != nil && !xerrors.Is(err, fs.ErrNotExist) {
		return nil, nil, xerrors.Errorf("open coder binary archive failed: %w", err)
	}
	if err != nil {
		// There is no archive; look for loose files in the embedded bin
		// directory instead.
		entries, err := fs.ReadDir(siteFS, "bin")
		if err != nil && !xerrors.Is(err, fs.ErrNotExist) {
			return nil, nil, xerrors.Errorf("site fs read dir failed: %w", err)
		}
		if err != nil || len(filterFiles(entries, "GITKEEP")) == 0 {
			// Either the bin directory is missing or it holds nothing but
			// bin/GITKEEP. Serve the cache directory, thus allowing binaries
			// to be placed there.
			return serveDest()
		}

		// There are files other than bin/GITKEEP, serve them.
		embedded, err := fs.Sub(siteFS, "bin")
		if err != nil {
			return nil, nil, xerrors.Errorf("site fs sub dir failed: %w", err)
		}
		return http.FS(embedded), nil, nil
	}
	defer archive.Close()

	binFS, err := mkdest()
	if err != nil {
		return nil, nil, err
	}

	shaFiles, err := parseSHA1(siteFS)
	if err != nil {
		return nil, nil, xerrors.Errorf("parse sha1 file failed: %w", err)
	}

	current, err := verifyBinSha1IsCurrent(dest, siteFS, shaFiles)
	if err != nil {
		return nil, nil, xerrors.Errorf("verify coder binaries sha1 failed: %w", err)
	}
	if current {
		// The binaries on disk already match, skip the extraction.
		return binFS, shaFiles, nil
	}

	extracted, err := extractBin(dest, archive)
	if err != nil {
		return nil, nil, xerrors.Errorf("extract coder binaries failed: %w", err)
	}
	if extracted == 0 {
		return nil, nil, xerrors.New("no files were extracted from coder binaries archive")
	}
	return binFS, shaFiles, nil
}

// extractBin unpacks the zstd-compressed tar stream r into dest. Every entry
// is written directly into dest under the base name of its path, so any
// directory structure in the archive is flattened. Entries named "." or
// containing ".." are skipped. It returns the number of files written, which
// is also reported alongside an error when extraction stops part-way.
func extractBin(dest string, r io.Reader) (numExtracted int, err error) {
	decoder, err := zstd.NewReader(r,
		// Concurrency doesn't help us when decoding the tar and
		// can actually slow us down.
		zstd.WithDecoderConcurrency(1),
		// Ignoring checksums can give a slight performance
		// boost but it's probably not worth the reduced safety.
		zstd.IgnoreChecksum(false),
		// Allow the decoder to use more memory giving us a 2-3x
		// performance boost.
		zstd.WithDecoderLowmem(false),
	)
	if err != nil {
		return 0, xerrors.Errorf("open zstd archive failed: %w", err)
	}
	defer decoder.Close()

	archive := tar.NewReader(decoder)
	for {
		header, err := archive.Next()
		switch {
		case errors.Is(err, io.EOF):
			// End of archive, everything was extracted.
			return numExtracted, nil
		case err != nil:
			return numExtracted, xerrors.Errorf("read tar archive failed: %w", err)
		}

		if header.Name == "." || strings.Contains(header.Name, "..") {
			continue
		}

		target := filepath.Join(dest, filepath.Base(header.Name))
		out, err := os.Create(target)
		if err != nil {
			return numExtracted, xerrors.Errorf("create file failed: %w", err)
		}
		//#nosec // We created this tar, no risk of decompression bomb.
		_, copyErr := io.Copy(out, archive)
		// The file is closed on both paths; a copy failure takes precedence
		// over a close failure.
		closeErr := out.Close()
		if copyErr != nil {
			return numExtracted, xerrors.Errorf("write file contents failed: %w", copyErr)
		}
		if closeErr != nil {
			return numExtracted, xerrors.Errorf("close file failed: %w", closeErr)
		}

		numExtracted++
	}
}

// binMetadata is the per-file information cached by binMetadataCache and used
// by binHandler to populate response headers.
type binMetadata struct {
	sizeBytes int64 // -1 if not known yet
	// SHA1 was chosen because it's fast to compute and reasonable for
	// determining if a file has changed. The ETag is not used as a security
	// measure.
	sha1Hash string // always set if in the cache
}

// binMetadataCache lazily computes and caches binMetadata for files in binFS.
type binMetadataCache struct {
	// binFS is the file system the binaries are read from.
	binFS          http.FileSystem
	// originalHashes holds precomputed SHA1 hashes by file name. It is
	// filled in by newBinMetadataCache and consulted by getMetadata before
	// hashing a file.
	originalHashes map[string]string

	// metadata is the cache itself, keyed by file name. Access is guarded by
	// mut.
	metadata map[string]binMetadata
	mut      sync.RWMutex
	// sf is used by getMetadata so the work for a given file name is only
	// done once at a time.
	sf       singleflight.Group
	// sem is a buffered channel used by getMetadata as a semaphore that
	// bounds how many files are processed at once.
	sem      chan struct{}
}

// newBinMetadataCache returns an empty metadata cache for the files in binFS.
// binSha1Hashes is copied, so later changes to the caller's map do not affect
// the cache. At most four files are processed concurrently.
func newBinMetadataCache(binFS http.FileSystem, binSha1Hashes map[string]string) *binMetadataCache {
	// Previously the hashes were copied into the cache immediately. Since
	// other information like the size is now read from the file, that is no
	// longer possible. Instead they are kept in a separate map that is used
	// to populate the cache on the first request for each file.
	known := make(map[string]string, len(binSha1Hashes))
	for name, hash := range binSha1Hashes {
		known[name] = hash
	}

	// The mutex and singleflight group are ready to use as zero values.
	return &binMetadataCache{
		binFS:          binFS,
		originalHashes: known,
		metadata:       make(map[string]binMetadata, len(binSha1Hashes)),
		sem:            make(chan struct{}, 4),
	}
}

// getMetadata returns the size and SHA1 hash of the named file in binFS,
// computing and caching them on first use. name must be a plain base name;
// anything else is reported as os.ErrNotExist without touching the file
// system. Errors from opening, statting, or reading the file are returned
// unchanged and are not cached.
func (b *binMetadataCache) getMetadata(name string) (binMetadata, error) {
	b.mut.RLock()
	cached, found := b.metadata[name]
	b.mut.RUnlock()
	if found {
		return cached, nil
	}

	// load does the actual work for a cache miss.
	load := func() (any, error) {
		// Take a slot in the pool for the duration of the work.
		b.sem <- struct{}{}
		defer func() { <-b.sem }()

		// Reject any invalid or non-basename paths before touching the filesystem.
		invalid := name == "" ||
			name == "." ||
			strings.Contains(name, "/") ||
			strings.Contains(name, "\\") ||
			!fs.ValidPath(name) ||
			path.Base(name) != name
		if invalid {
			return binMetadata{}, os.ErrNotExist
		}

		file, err := b.binFS.Open(name)
		if err != nil {
			return binMetadata{}, err
		}
		defer file.Close()

		info, err := file.Stat()
		if err != nil {
			return binMetadata{}, err
		}
		result := binMetadata{sizeBytes: info.Size()}

		if known, ok := b.originalHashes[name]; ok {
			// A precomputed hash is available, no need to read the file.
			result.sha1Hash = known
		} else {
			hasher := sha1.New() //#nosec // Not used for cryptography.
			if _, err := io.Copy(hasher, file); err != nil {
				return binMetadata{}, err
			}
			result.sha1Hash = hex.EncodeToString(hasher.Sum(nil))
		}

		b.mut.Lock()
		b.metadata[name] = result
		b.mut.Unlock()
		return result, nil
	}

	// Avoid DOS by using a pool, and only doing work once per file.
	v, err, _ := b.sf.Do(name, load)
	if err != nil {
		return binMetadata{}, err
	}

	//nolint:forcetypeassert
	return v.(binMetadata), nil
}

// filterFiles returns the entries of files whose name is not listed in names,
// preserving their order. The result is nil when nothing is kept.
func filterFiles(files []fs.DirEntry, names ...string) []fs.DirEntry {
	var kept []fs.DirEntry
	for _, entry := range files {
		if excluded := slices.Contains(names, entry.Name()); !excluded {
			kept = append(kept, entry)
		}
	}
	return kept
}

// verifyBinSha1IsCurrent reports whether the binaries in dest match the ones
// described by the embedded bin/coder.sha1. It first compares the embedded
// checksum file with dest/coder.sha1 byte for byte, then hashes every file
// listed in shaFiles and compares it with the expected hash (ignoring case).
//
// A missing checksum file, a missing binary, or any difference yields
// (false, nil). Other read failures are returned as errors.
func verifyBinSha1IsCurrent(dest string, siteFS fs.FS, shaFiles map[string]string) (ok bool, err error) {
	embeddedSums, err := fs.ReadFile(siteFS, "bin/coder.sha1")
	if err != nil {
		return false, xerrors.Errorf("read coder sha1 from embedded fs failed: %w", err)
	}

	diskSums, err := os.ReadFile(filepath.Join(dest, "coder.sha1"))
	switch {
	case xerrors.Is(err, fs.ErrNotExist):
		return false, nil
	case err != nil:
		return false, xerrors.Errorf("read coder sha1 failed: %w", err)
	case !bytes.Equal(embeddedSums, diskSums):
		// Differing shasum files allow an early exit.
		return false, nil
	}

	// Speed up startup by verifying files concurrently. Concurrency
	// is limited to save resources / early-exit. Early-exit speed
	// could be improved by using a context aware io.Reader and
	// passing the context from errgroup.WithContext.
	var eg errgroup.Group
	eg.SetLimit(3)

	// Verify the hash of each on-disk binary.
	for file, want := range shaFiles {
		eg.Go(func() error {
			got, err := sha1HashFile(filepath.Join(dest, file))
			switch {
			case xerrors.Is(err, fs.ErrNotExist):
				return errHashMismatch
			case err != nil:
				return xerrors.Errorf("hash file failed: %w", err)
			case !strings.EqualFold(want, got):
				return errHashMismatch
			}
			return nil
		})
	}

	switch err := eg.Wait(); {
	case err == nil:
		return true, nil
	case xerrors.Is(err, errHashMismatch):
		// A mismatch is an expected outcome, not a failure.
		return false, nil
	default:
		return false, err
	}
}

// sha1HashFile streams the named file through SHA1 and returns the digest as
// a lowercase hex string. Errors from opening or reading the file are
// returned unchanged.
func sha1HashFile(name string) (string, error) {
	file, err := os.Open(name)
	if err != nil {
		return "", err
	}
	defer file.Close()

	//#nosec // Not used for cryptography.
	digest := sha1.New()
	if _, err := io.Copy(digest, file); err != nil {
		return "", err
	}

	return hex.EncodeToString(digest.Sum(nil)), nil
}

// parseSHA1 reads bin/coder.sha1 from siteFS and returns a map of file name
// to lowercase hex SHA1 hash. Each line must have the binary-mode shasum
// format "<hash> *<name>".
//
// It returns an error if the file cannot be read, is empty (ignoring
// surrounding whitespace), or contains a line that does not split into
// exactly a hash and a name.
func parseSHA1(siteFS fs.FS) (map[string]string, error) {
	b, err := fs.ReadFile(siteFS, "bin/coder.sha1")
	if err != nil {
		return nil, xerrors.Errorf("read coder sha1 from embedded fs failed: %w", err)
	}

	// Check for emptiness up front: splitting empty input still yields one
	// (empty) line, which would otherwise be misreported as malformed.
	b = bytes.TrimSpace(b)
	if len(b) == 0 {
		return nil, xerrors.New("empty sha1 file")
	}

	lines := bytes.Split(b, []byte{'\n'})
	shaFiles := make(map[string]string, len(lines))
	for i, line := range lines {
		parts := bytes.Split(line, []byte{' ', '*'})
		if len(parts) != 2 {
			// There is no underlying error to wrap here, so describe the
			// offending line instead.
			return nil, xerrors.Errorf("malformed sha1 file: line %d: %q", i+1, line)
		}
		shaFiles[string(parts[1])] = strings.ToLower(string(parts[0]))
	}

	return shaFiles, nil
}