#!/usr/bin/env node
// @ts-check
/**
 * Pack the locally-built `next` package and run agent evals against it.
 *
 *   pnpm eval <eval-name>             run one eval, both variants (baseline + AGENTS.md)
 *   pnpm eval <eval-name> --dry       preview without executing
 *   pnpm eval --all                   run every eval (slow — normally only CI does this)
 *   NEXT_SKIP_PACK=1 pnpm eval ...    reuse tarball from last run
 *
 * Mirrors run-tests.js: pack once, hand paths to child via env, forward args.
 *
 * We only pack `next`, not the whole workspace. The sandbox is remote Linux:
 *   - @next/swc: local darwin binary wouldn't run there; the sandbox downloads
 *     the right one at runtime (packages/next/src/build/swc/index.ts).
 *   - @next/env etc: resolved from npm at the pinned canary version.
 *
 * The experiments/ dir is generated fresh on every run and gitignored. This
 * keeps the two variants (baseline vs. AGENTS.md) in one place instead of
 * maintaining N committed experiment files that only differ by one line.
 */
const path = require('path')
const fs = require('fs')
const { execFileSync, spawnSync } = require('child_process')

// Repo root: this script lives at the repository root, so __dirname is it.
const ROOT = __dirname

// Layout: evals/ holds the harness, evals/evals/ the fixture directories,
// evals/experiments/ the generated (gitignored) experiment configs, and
// evals/.tarballs/ the packed `next` tarball under a stable filename.
const EVALS_DIR = path.join(ROOT, 'evals')
const FIXTURES_DIR = path.join(EVALS_DIR, 'evals')
const EXPERIMENTS_DIR = path.join(EVALS_DIR, 'experiments')
const TARBALL_DIR = path.join(EVALS_DIR, '.tarballs')
// Fixed name so NEXT_SKIP_PACK can reuse it and the child env var is stable.
const TARBALL = path.join(TARBALL_DIR, 'next.tgz')

// The two variants we always compare. Order matters for output readability:
// baseline first so a contributor sees "does the agent fail without docs?"
// before "does it pass with docs?".
// Each entry supplies the import line(s) and setup statement(s) that
// writeExperiments() splices verbatim into the generated experiment file.
const VARIANTS = [
  {
    suffix: 'baseline',
    imports: `import { installNextJs } from '../lib/setup.js'`,
    setup: `await installNextJs(sandbox)`,
  },
  {
    suffix: 'agents-md',
    imports: `import { installNextJs, writeAgentsMd } from '../lib/setup.js'`,
    // Note: the embedded "\n    " keeps the second statement aligned with
    // the generated file's indentation — do not reformat.
    setup: `await installNextJs(sandbox)\n    await writeAgentsMd(sandbox)`,
  },
]

/**
 * Pack the locally-built `next` package and move the tarball to TARBALL.
 *
 * `pnpm pack` prints the produced tarball name (possibly after other output
 * lines), so we take the last non-empty line. Splitting on /\r?\n/ avoids a
 * stray trailing `\r` leaking into the filename on Windows, and an explicit
 * empty-output check turns a cryptic rename failure into a clear error.
 *
 * @throws {Error} if `pnpm pack` printed nothing usable.
 */
function pack() {
  fs.mkdirSync(TARBALL_DIR, { recursive: true })
  const out = execFileSync(
    'pnpm',
    ['pack', '--pack-destination', TARBALL_DIR],
    { cwd: path.join(ROOT, 'packages/next'), encoding: 'utf8' }
  )
  // Last non-empty line is the tarball path/filename pnpm reports.
  const lines = out
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean)
  const produced = lines.at(-1)
  if (!produced) {
    throw new Error(`pnpm pack produced no output; cannot locate tarball`)
  }
  const src = path.isAbsolute(produced)
    ? produced
    : path.join(TARBALL_DIR, produced)
  // Normalize to a fixed name so reuse (NEXT_SKIP_PACK) and the child env
  // handoff always point at the same path.
  fs.renameSync(src, TARBALL)
}

/**
 * Regenerate evals/experiments/ from scratch: one experiment config file per
 * entry in VARIANTS (baseline.ts, agents-md.ts), each differing only in the
 * imports/setup lines the variant contributes.
 *
 * @param {string | null} evalName  null means all evals; otherwise the single
 *   fixture name is pinned via the generated `evals:` field.
 */
function writeExperiments(evalName) {
  // Wipe-and-recreate so stale variant files from previous runs never linger.
  fs.rmSync(EXPERIMENTS_DIR, { recursive: true, force: true })
  fs.mkdirSync(EXPERIMENTS_DIR, { recursive: true })

  // When a single eval is requested, emit an `evals:` line; otherwise omit
  // the field entirely (agent-eval then runs everything).
  const evalsField = evalName ? `\n  evals: '${evalName}',` : ''
  for (const v of VARIANTS) {
    // The template's exact indentation/newlines are significant — they are
    // written verbatim into the generated TypeScript file.
    const body = `import type { ExperimentConfig } from '@vercel/agent-eval'
${v.imports}

const config: ExperimentConfig = {
  agent: 'claude-code',
  model: 'claude-opus-4-6',${evalsField}
  scripts: ['build'],
  runs: 1,
  earlyExit: true,
  timeout: 720,
  sandbox: 'auto',
  setup: async (sandbox) => {
    ${v.setup}
  },
}

export default config
`
    fs.writeFileSync(path.join(EXPERIMENTS_DIR, `${v.suffix}.ts`), body)
  }
}

/**
 * List the available eval fixtures.
 * @returns {string[]} Names of the directories directly under FIXTURES_DIR.
 */
function listEvals() {
  const names = []
  for (const entry of fs.readdirSync(FIXTURES_DIR, { withFileTypes: true })) {
    if (entry.isDirectory()) {
      names.push(entry.name)
    }
  }
  return names
}

/**
 * CLI entry point: parse args, ensure a `next` tarball exists, regenerate the
 * experiment configs, then exec agent-eval and exit with its status.
 */
function main() {
  const argv = require('yargs/yargs')(process.argv.slice(2))
    .command(
      '$0 [eval-name]',
      'Run an eval (baseline + agents-md variants)',
      (y) =>
        y.positional('eval-name', {
          type: 'string',
          describe: 'Fixture directory name',
        })
    )
    .boolean('all')
    .describe('all', 'Run every eval (slow — normally only CI does this)')
    .boolean('dry')
    .describe('dry', 'Preview without executing')
    // --all and a positional eval name are mutually exclusive.
    .conflicts('all', 'eval-name')
    .check((argv) => {
      // Require either --all or an eval name; list the fixtures on error so
      // the user doesn't have to go hunting for valid names.
      if (!argv.all && !argv.evalName) {
        throw new Error(
          `Missing <eval-name>.\n\nAvailable evals:\n${listEvals()
            .map((n) => `  ${n}`)
            .join('\n')}`
        )
      }
      // Validate the fixture exists before doing any packing work.
      if (
        argv.evalName &&
        !fs.existsSync(path.join(FIXTURES_DIR, argv.evalName))
      ) {
        throw new Error(
          `Unknown eval: ${argv.evalName}\n(looked in ${FIXTURES_DIR})`
        )
      }
      return true
    })
    .strict()
    .help().argv

  /** @type {string | null} */
  const evalName = argv.all ? null : /** @type {string} */ (argv.evalName)
  // Flags not consumed here are forwarded to agent-eval.
  const forward = argv.dry ? ['--dry'] : []

  // Fail fast with a helpful message if `next` hasn't been built yet — the
  // pack step would otherwise produce an unusable tarball.
  if (!fs.existsSync(path.join(ROOT, 'packages/next/dist'))) {
    console.error(
      'packages/next/dist not found. Run `pnpm --filter=next build` first.'
    )
    process.exit(1)
  }

  // Pack once per run; NEXT_SKIP_PACK lets repeat runs reuse the last tarball.
  if (process.env.NEXT_SKIP_PACK && fs.existsSync(TARBALL)) {
    console.log('> Reusing existing tarball (NEXT_SKIP_PACK=1)')
  } else {
    console.log('> Packing next...')
    pack()
    const mb = (fs.statSync(TARBALL).size / 1024 / 1024).toFixed(1)
    console.log(`  ${TARBALL} (${mb} MB)`)
  }

  // agent-eval loads .env / .env.local from its own cwd (evals/). `vc env pull`
  // writes to the repo root, so symlink them into evals/ for agent-eval to find.
  for (const envFile of ['.env', '.env.local']) {
    const src = path.join(ROOT, envFile)
    const dest = path.join(EVALS_DIR, envFile)
    try {
      // Remove stale symlink or file before creating a fresh one.
      fs.rmSync(dest, { force: true })
      if (fs.existsSync(src)) {
        fs.symlinkSync(src, dest)
      }
    } catch {}
    // NOTE(review): errors here are deliberately best-effort — a missing
    // symlink only means agent-eval won't see the env files.
  }

  writeExperiments(evalName)
  console.log(
    evalName
      ? `> Running ${evalName} (baseline + agents-md)`
      : '> Running all evals (baseline + agents-md)'
  )

  // Same handoff pattern as run-tests.js with NEXT_TEST_PKG_PATHS. We invoke
  // the bin directly rather than via `pnpm exec` because pnpm resets cwd to
  // the workspace root, but agent-eval resolves experiments/ from process.cwd().
  const bin = path.join(ROOT, 'node_modules/.bin/agent-eval')
  const result = spawnSync(bin, ['run-all', '--force', ...forward], {
    cwd: EVALS_DIR,
    stdio: 'inherit',
    // Tarball path handed to the child via env, mirroring run-tests.js.
    env: { ...process.env, NEXT_EVAL_TARBALL: TARBALL },
  })
  if (result.error) {
    // ENOENT (missing bin), EACCES, etc. — spawnSync returns status: null
    // without printing anything, so surface it.
    console.error(`Failed to run ${bin}: ${result.error.message}`)
    if (/** @type {NodeJS.ErrnoException} */ (result.error).code === 'ENOENT') {
      console.error('Did you run `pnpm install`?')
    }
    process.exit(1)
  }
  // Propagate the child's exit code; null status (e.g. killed by signal)
  // maps to 1 so CI still fails.
  process.exit(result.status ?? 1)
}

// Script entry point.
main()