Create-Inc · danielchen0 · May 17, 2026 · May 17, 2026 · May 17, 2026 · May 17, 2026
diff --git a/.prettierignore b/.prettierignore
@@ -1,3 +1,4 @@
 dist/
 node_modules/
 package-lock.json
+paper/eval/results/
diff --git a/knip.json b/knip.json
@@ -1,3 +1,3 @@
 {
-  "ignore": ["dist/**"]
+  "ignore": ["dist/**", "paper/eval/results/**"]
 }
diff --git a/package.json b/package.json
@@ -21,6 +21,8 @@
     "lint:fix": "eslint --fix . && prettier --write .",
     "format": "prettier --write .",
     "format:check": "prettier --check .",
+    "eval:prompt-grid": "npm run build && tsx scripts/run-prompt-grid-eval.ts",
+    "paper:stats": "tsx scripts/paper-stats.ts",
     "sync": "tsx scripts/sync.ts",
     "sync:check": "tsx scripts/sync.ts && git diff --exit-code -- src/rules/index.ts README.md"
   },

diff --git a/paper/.gitignore b/paper/.gitignore
@@ -0,0 +1,9 @@
+*.aux
+*.bbl
+*.blg
+*.fdb_latexmk
+*.fls
+*.log
+*.out
+*.pdf
+eval/results/
diff --git a/paper/Makefile b/paper/Makefile
@@ -0,0 +1,9 @@
+PDF=main.pdf
+
+.PHONY: all clean
+
+all:
+	latexmk -pdf -interaction=nonstopmode main.tex
+
+clean:
+	latexmk -C main.tex
diff --git a/paper/README.md b/paper/README.md
@@ -0,0 +1,100 @@
+# Laint Paper Draft
+
+This directory contains an initial arXiv-style paper draft for laint.
+
+## Current Shape
+
+The draft is intentionally framed as a research/tool paper, not a product announcement. The strongest publishable angle is:
+
+> Agent-oriented linting for generated JSX/TSX applications catches framework-specific web, mobile, and backend failures earlier than conventional build/type/runtime feedback.
+
+## Before Submission
+
+- Add real authors and affiliations.
+- Decide whether this targets arXiv only, a workshop, or both.
+- Run the prompt-to-code detector-quality evaluation described in `main.tex`.
+- Replace the evaluation-plan section with measured results.
+- Add citations to relevant program-repair and LLM-code-generation work.
+- Build the PDF from `main.tex` and inspect it before submission.
+
+## Version Pinning
+
+This draft pins its rule counts and preliminary benchmark results to `main` commit
+`6a60a0295955ee6cc1d639c88955ea50722e3516` from 2026-05-14.
+
+For future papers or follow-up benchmark runs, record:
+
+- The exact `main` commit or benchmark tag used for the laint rule corpus.
+- The prompt suite version.
+- The model IDs and provider versions used for generation.
+- The run date and output directory.
+
+A future tag scheme such as `benchmark/agent-oriented-linting-2026-05` or
+`paper/agent-oriented-linting-v1` would make these runs easier to cite without
+depending on floating branch names.
+
+## Reproducing Paper Numbers
+
+Every numeric claim in the draft should be calculated either from repository
+source or from a checked-in benchmark artifact.
+
+Rule corpus counts, severity counts, platform counts, and the category table are
+calculated from `src/rules/*` metadata:
+
+```bash
+npm run paper:stats
+```
+
+The preliminary prompt-grid numbers in `main.tex` are calculated from the
+archived run artifact at `paper/eval/artifacts/initial-grid/results.json`:
+
+```bash
+npm run paper:stats -- --eval paper/eval/artifacts/initial-grid/results.json
+```
+
+There is also a larger raw grid artifact at
+`paper/eval/artifacts/full-grid-2026-05-17/results.json`:
+
+```bash
+npm run paper:stats -- --eval paper/eval/artifacts/full-grid-2026-05-17/results.json
+```
+
+This raw run covers 6 prompts and 7 configured model aliases. Moonshot/Kimi failed
+all 6 generations due to provider authentication/network errors, so use this
+artifact as raw evidence rather than final paper numbers until the Moonshot
+credential path is fixed or the reported model grid is explicitly scoped to the
+6 working model aliases.
+
+The generated app files under `paper/eval/results/` remain ignored because they
+are working outputs. If a benchmark run contributes numbers to a paper, archive
+the corresponding `results.json` under `paper/eval/artifacts/<run-name>/` or
+attach it to a tagged release before citing the numbers.
+
+## Suggested Evaluation Data
+
+- A prompt suite covering web, mobile, and backend app-building tasks.
+- Generated JSX/TSX outputs from one or more LLMs.
+- Laint findings for each generated output.
+- Human labels marking each finding as a valid, invalid, or ambiguous report of a violation.
+- Missed-defect labels for recall, when an independently reviewed corpus is available.
+- TypeScript, framework build, web preview, mobile simulator/device preview, and runtime outcomes.
+- Repair iteration counts after lint feedback.
+
+## Prompt Grid
+
+Run a small prompt-to-code grid with Doppler-provided model keys:
+
+```bash
+doppler run --project flux-worker --config dev -- npm run eval:prompt-grid
+```
+
+Useful options:
+
+```bash
+npm run eval:prompt-grid -- --limit 2
+npm run eval:prompt-grid -- --models openai-gpt-5.5,anthropic-sonnet-4.6,google-3.1-pro
+npm run eval:prompt-grid -- --out paper/eval/results/my-run
+```
+
+The runner writes raw generated files, `results.json`, `summary.md`, and `labels.todo.jsonl`
+under `paper/eval/results/`. That directory is intentionally ignored by git.
diff --git a/paper/eval/artifacts/full-grid-2026-05-17/results.json b/paper/eval/artifacts/full-grid-2026-05-17/results.json
diff --git a/paper/eval/artifacts/initial-grid/results.json b/paper/eval/artifacts/initial-grid/results.json
diff --git a/paper/eval/prompts.json b/paper/eval/prompts.json
@@ -0,0 +1,44 @@
+[
+  {
+    "id": "taskflow-web",
+    "platform": "web",
+    "source": "refactor-bench",
+    "description": "React task management component with CRUD, search, filtering, modals, themes, and persistent UI preferences.",
+    "outputFile": "app/page.tsx"
+  },
+  {
+    "id": "chat-web",
+    "platform": "web",
+    "source": "refactor-bench",
+    "description": "Realtime chat application page with auth gate, message history, typing indicators, local draft persistence, and theme switching.",
+    "outputFile": "app/page.tsx"
+  },
+  {
+    "id": "event-planner-mobile",
+    "platform": "expo",
+    "source": "refactor-bench",
+    "description": "React Native event planning app screen with event browsing, RSVP management, calendar view, category filtering, search, location display, attendee lists, event creation modal, notifications, and user profiles.",
+    "outputFile": "src/screens/HomeScreen.tsx"
+  },
+  {
+    "id": "beauty-shop-mobile",
+    "platform": "expo",
+    "source": "refactor-bench",
+    "description": "Beauty and cosmetics shopping mobile app screen with wishlist, brand discovery, product categories, search, profile access, and bottom-tab navigation.",
+    "outputFile": "src/app/(tabs)/shop.tsx"
+  },
+  {
+    "id": "wallet-api-backend",
+    "platform": "backend",
+    "source": "custom",
+    "description": "Next.js route handler for wallet transfers with request validation, balance lookup, transaction creation, retry handling, and JSON responses.",
+    "outputFile": "app/api/wallet/transfer/route.ts"
+  },
+  {
+    "id": "insurance-reports-backend",
+    "platform": "backend",
+    "source": "refactor-bench",
+    "description": "Next.js route handler for insurance report aggregation with role-based access checks, filters, conversion-rate calculations, CSV export support, and error logging.",
+    "outputFile": "app/api/reports/route.ts"
+  }
+]