From b4c3bcbc0417d3df5b22212969fcc7e318211823 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 07:13:47 +0000 Subject: [PATCH 1/2] Initial plan From 5fd3b0e0cbac6f04f0ff0415a78580559e199316 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 07:22:16 +0000 Subject: [PATCH 2/2] feat: support multiple compliance documents (PDFs, URLs, text files) for PR review Co-authored-by: JayPat2003 <87596889+JayPat2003@users.noreply.github.com> Agent-Logs-Url: https://github.com/JayPat2003/github-architect-code-reviewer/sessions/5b791906-a427-47fb-81ce-7446faa63057 --- README.md | 50 +++++++++++++++++++++++++---------- src/cli.py | 50 +++++++++++++++++++++++------------ src/doc_loader.py | 41 ++++++++++++++++++++++++----- src/reporter.py | 6 ++++- src/reviewer.py | 66 ++++++++++++++++++++++++++++++----------------- src/types.py | 10 ++++--- 6 files changed, 156 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index fea37f3..02af405 100644 --- a/README.md +++ b/README.md @@ -43,8 +43,8 @@ This tool automates the architecture review step using **GitHub Copilot AI**. When a developer opens a Pull Request, this tool: 1. **Reads** the code changes from GitHub automatically. -2. **Reads** your organisation's architecture document (PDF, web page, or text file). -3. **Sends both** to GitHub Copilot AI and asks it to check for violations. +2. **Reads** one or more of your organisation's compliance documents (PDFs, web pages, or text/Markdown files). +3. **Sends everything** to GitHub Copilot AI and asks it to check for violations across all documents. 4. **Produces** a structured report listing every issue found, with severity levels and suggested fixes. The result is an instant, consistent, and repeatable architecture review — every single time a Pull Request is raised. @@ -312,7 +312,7 @@ To generate a token: ## Running the Tool -### Basic command +### Basic command (single compliance document) ```bash python -m src.cli review \ @@ -322,7 +322,23 @@ python -m src.cli review \ --doc path/to/architecture.pdf ``` -### With a URL as the architecture document +### Multiple compliance documents + +Repeat `--doc` for each additional document — any combination of PDFs, URLs, and text/Markdown files is supported: + +```bash +python -m src.cli review \ + --owner your-org \ + --repo your-repo \ + --pr 42 \ + --doc docs/architecture-standards.pdf \ + --doc https://your-intranet.com/security-policy \ + --doc docs/company-guidelines.md +``` + +The AI will check the PR diff against **all** supplied documents and report any violation found in any of them. + +### With a URL as the compliance document ```bash python -m src.cli review \ @@ -355,13 +371,13 @@ python -m src.cli review --help The generated JSON report contains the following sections: -| Field | Description | -|------------------|----------------------------------------------------------| -| `meta` | PR details: owner, repo, number, title, and review date | -| `passed` | `true` if no errors were found, `false` otherwise | -| `summary` | A paragraph written by the AI summarising the review | -| `comments` | A list of specific issues found (see below) | -| `files_reviewed` | Every file that was part of the Pull Request | +| Field | Description | +|--------------------|---------------------------------------------------------------------| +| `meta` | PR details: owner, repo, number, title, review date, and the list of compliance documents used | +| `passed` | `true` if no errors were found, `false` otherwise | +| `summary` | A paragraph written by the AI summarising the review | +| `comments` | A list of specific issues found (see below) | +| `files_reviewed` | Every file that was part of the Pull Request | ### Comment severity levels @@ -380,7 +396,11 @@ The generated JSON report contains the following sections: "repo": "payment-service", "pr_number": 42, "pr_title": "Add Stripe payment integration", - "reviewed_at": "20240615_143022" + "reviewed_at": "20240615_143022", + "compliance_docs": [ + { "source": "docs/architecture-standards.pdf", "doc_type": "pdf" }, + { "source": "https://your-intranet.com/security-policy", "doc_type": "url" } + ] }, "passed": false, "summary": "The PR introduces a payment module but contains a hardcoded API key in config.py, which directly violates the organisation's secrets management policy.", @@ -439,9 +459,11 @@ jobs: --owner ${{ github.repository_owner }} \ --repo ${{ github.event.repository.name }} \ --pr ${{ github.event.pull_request.number }} \ - --doc docs/architecture-standards.pdf + --doc docs/architecture-standards.pdf \ + --doc docs/security-policy.md \ + --doc https://your-intranet.com/company-guidelines ``` -When this workflow is added to a repository, **every Pull Request will be automatically reviewed** against the architecture document. If violations are found, the check will fail and the merge will be blocked until the issues are resolved. +When this workflow is added to a repository, **every Pull Request will be automatically reviewed** against all configured compliance documents. If violations are found, the check will fail and the merge will be blocked until the issues are resolved. --- \ No newline at end of file diff --git a/src/cli.py b/src/cli.py index c714e1f..ac372ff 100644 --- a/src/cli.py +++ b/src/cli.py @@ -14,22 +14,31 @@ # 1. Create a .env file with your token: # GITHUB_TOKEN=ghp_... # - # 2. Basic usage: + # 2. Single compliance document: # python -m src.cli review \ # --owner \ # --repo \ # --pr \ # --doc path/to/architecture.pdf # - # 3. Override the output directory: + # 3. Multiple compliance documents (PDFs, URLs, text files): + # python -m src.cli review \ + # --owner \ + # --repo \ + # --pr \ + # --doc path/to/architecture.pdf \ + # --doc https://your-intranet.com/security-policy \ + # --doc docs/company-guidelines.md + # + # 4. Override the output directory: # python -m src.cli review ... --output ./my-reports # - # 4. Show help: + # 5. Show help: # python -m src.cli --help # python -m src.cli review --help Pipeline triggered by this file: - fetch_pull_request() → load_document() → run_review() → save_report() + fetch_pull_request() → load_documents() → run_review() → save_report() """ import os @@ -46,7 +55,7 @@ @click.group() -@click.version_option("1.0.0") +@click.version_option("1.1.0") def main() -> None: """AI-powered code review against architecture principles using GitHub Copilot.""" @@ -55,19 +64,24 @@ def main() -> None: @click.option("--owner", required=True, help="GitHub repository owner") @click.option("--repo", required=True, help="GitHub repository name") @click.option("--pr", "pr_number", required=True, type=int, help="Pull request number") -@click.option("--doc", "doc_path", required=True, help="Path or URL to the architecture document (PDF, URL, or text file)") +@click.option("--doc", "doc_paths", required=True, multiple=True, + help=( + "Path or URL to a compliance / architecture document " + "(PDF, URL, or text file). " + "Repeat this option to supply multiple documents." + )) @click.option("--output", "output_dir", default=os.getenv("REPORT_OUTPUT_DIR", "./reports"), show_default=True, help="Directory for the generated report") -def review(owner: str, repo: str, pr_number: int, doc_path: str, output_dir: str) -> None: +def review(owner: str, repo: str, pr_number: int, doc_paths: tuple, output_dir: str) -> None: """ - Review a pull request against an architecture document. + Review a pull request against one or more compliance documents. Steps performed: 1. Validate that GITHUB_TOKEN is present in the environment. 2. Fetch the PR diff from GitHub via github_client.py. - 3. Load and parse the architecture document via doc_loader.py. - 4. Send the diff + document to the Copilot API via reviewer.py. + 3. Load and parse every compliance document via doc_loader.py. + 4. Send the diff + all documents to the Copilot API via reviewer.py. 5. Persist the structured report to disk via reporter.py. 6. Exit 0 if no errors were found, exit 1 otherwise (CI-friendly). @@ -75,7 +89,8 @@ def review(owner: str, repo: str, pr_number: int, doc_path: str, output_dir: str owner : GitHub org or user who owns the repository. repo : Repository name. pr_number : PR number to review (--pr flag). - doc_path : Local file path or HTTPS URL of the architecture doc. + doc_paths : One or more local file paths or HTTPS URLs of compliance + documents. Supply --doc multiple times for multiple sources. output_dir : Directory where the report file will be written. """ # ── 1. Guard: token must exist before any network call ──────────────────── @@ -85,11 +100,12 @@ def review(owner: str, repo: str, pr_number: int, doc_path: str, output_dir: str sys.exit(1) console.print(f"[bold cyan]Reviewing PR #{pr_number}[/] in [green]{owner}/{repo}[/]") - console.print(f"Architecture doc: [yellow]{doc_path}[/]") + for i, doc in enumerate(doc_paths, start=1): + console.print(f"Compliance doc {i}: [yellow]{doc}[/]") # Lazy imports keep CLI startup fast (heavy deps load only when 'review' runs) from src.github_client import fetch_pull_request - from src.doc_loader import load_document + from src.doc_loader import load_documents from src.reviewer import run_review from src.reporter import save_report @@ -97,13 +113,13 @@ def review(owner: str, repo: str, pr_number: int, doc_path: str, output_dir: str with console.status("Fetching pull request…"): pr = fetch_pull_request(owner, repo, pr_number, github_token) - # ── 3. Load architecture document ───────────────────────────────────────── - with console.status("Loading architecture document…"): - arch_doc = load_document(doc_path) + # ── 3. Load compliance documents ────────────────────────────────────────── + with console.status(f"Loading {len(doc_paths)} compliance document(s)…"): + docs = load_documents(list(doc_paths)) # ── 4. Run Copilot review ───────────────────────────────────────────────── with console.status("Running Copilot review…"): - result = run_review(pr, arch_doc) + result = run_review(pr, docs) # ── 5. Save report ──────────────────────────────────────────────────────── Path(output_dir).mkdir(parents=True, exist_ok=True) diff --git a/src/doc_loader.py b/src/doc_loader.py index b600d5f..b71b384 100644 --- a/src/doc_loader.py +++ b/src/doc_loader.py @@ -2,20 +2,22 @@ doc_loader.py — Architecture document loader. Purpose: - Accepts a file path or URL pointing to an architecture document, - extracts its plain-text content, and returns an ArchitectureDoc object. - Supports three input types: + Accepts one or more file paths or URLs pointing to architecture / + compliance documents, extracts their plain-text content, and returns + the results as ArchitectureDoc objects. + Supports three input types per source: - 'pdf' : Local PDF file (parsed via pdfminer.six) - 'url' : HTTPS web page (scraped via requests + BeautifulSoup) - - 'text' : Plain .txt file (read directly) + - 'text' : Plain .txt / .md file (read directly) How it fits in the pipeline: - cli.py ──calls──> load_document() ──returns──> ArchitectureDoc + cli.py ──calls──> load_documents() ──returns──> List[ArchitectureDoc] """ import re from io import StringIO from pathlib import Path +from typing import List import requests from bs4 import BeautifulSoup @@ -46,7 +48,7 @@ def _load_url(url: str) -> str: def _load_text(path: str) -> str: - """Read a plain .txt file directly.""" + """Read a plain text or markdown file directly.""" return Path(path).read_text(encoding="utf-8").strip() @@ -60,7 +62,7 @@ def load_document(source: str) -> ArchitectureDoc: 3. Anything else → plain text loader. Args: - source: File path (PDF or .txt) or HTTPS URL. + source: File path (PDF, .txt, .md) or HTTPS URL. Returns: ArchitectureDoc with source, extracted content, and doc_type set. @@ -84,3 +86,28 @@ def load_document(source: str) -> ArchitectureDoc: doc_type = "text" return ArchitectureDoc(source=source, content=content, doc_type=doc_type) + + +def load_documents(sources: List[str]) -> List[ArchitectureDoc]: + """ + Load one or more compliance / architecture documents. + + Each source is processed independently via :func:`load_document`, so + any mix of PDFs, URLs, and plain-text files is valid. + + Args: + sources: Non-empty list of file paths or URLs. + + Returns: + List of ArchitectureDoc objects in the same order as *sources*. + + Raises: + ValueError: If *sources* is empty. + Any exception raised by individual document loaders is propagated + to the caller so that a single bad source does not silently produce + an empty document set. + """ + if not sources: + raise ValueError("At least one compliance document source must be provided.") + return [load_document(s) for s in sources] + diff --git a/src/reporter.py b/src/reporter.py index e9078f5..3cbdb49 100644 --- a/src/reporter.py +++ b/src/reporter.py @@ -58,6 +58,10 @@ def save_report(result: ReviewResult, output_dir: str) -> str: "pr_number": pr.number, "pr_title": pr.title, "reviewed_at": timestamp, + "compliance_docs": [ + {"source": doc.source, "doc_type": doc.doc_type} + for doc in result.compliance_docs + ], }, "passed": result.passed, "summary": result.summary, @@ -86,4 +90,4 @@ def save_report(result: ReviewResult, output_dir: str) -> str: report_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") # 4. Return path as string for cli.py to display - return str(report_path) \ No newline at end of file + return str(report_path) diff --git a/src/reviewer.py b/src/reviewer.py index 4ff7cd4..031022f 100644 --- a/src/reviewer.py +++ b/src/reviewer.py @@ -2,15 +2,15 @@ reviewer.py — GitHub Copilot API integration for code review. Purpose: - Builds a structured prompt from the PR diff and architecture document, - sends it to the GitHub Copilot chat completions API via the openai SDK, - parses the JSON response into ReviewComment objects, and returns a - ReviewResult. + Builds a structured prompt from the PR diff and one or more compliance / + architecture documents, sends it to the GitHub Copilot chat completions + API via the openai SDK, parses the JSON response into ReviewComment + objects, and returns a ReviewResult. How it fits in the pipeline: - cli.py ──calls──> run_review(pr, arch_doc) ──returns──> ReviewResult - │ - reporter.py uses it + cli.py ──calls──> run_review(pr, docs) ──returns──> ReviewResult + │ + reporter.py uses it Environment variables required: GITHUB_TOKEN : Used as the Bearer token for Copilot API auth. @@ -27,18 +27,18 @@ # ── Prompt builder ──────────────────────────────────────────────────────────── -def _build_prompt(pr: PullRequest, arch_doc: ArchitectureDoc) -> str: +def _build_prompt(pr: PullRequest, docs: List[ArchitectureDoc]) -> str: """ Construct the user message sent to the Copilot model. Includes: - PR title and description - - Architecture document content + - All compliance / architecture document contents (clearly labelled) - Unified diffs of every changed file Args: - pr : Fetched PullRequest object. - arch_doc: Loaded ArchitectureDoc object. + pr : Fetched PullRequest object. + docs: One or more loaded ArchitectureDoc objects. Returns: A single formatted string ready to send as the user message. @@ -53,10 +53,19 @@ def _build_prompt(pr: PullRequest, arch_doc: ArchitectureDoc) -> str: diffs = "\n\n".join(diff_sections) - return f"""You are a senior software architect reviewing a Pull Request for compliance with the project's architecture principles. + # Build a clearly labelled section for every compliance document + doc_sections = [] + for i, doc in enumerate(docs, start=1): + doc_sections.append( + f"### Compliance Document {i}: {doc.source} [{doc.doc_type}]\n" + f"{doc.content}" + ) + docs_block = "\n\n".join(doc_sections) + + return f"""You are a senior software architect reviewing a Pull Request for compliance with the project's architecture principles and company policies. -## Architecture Document -{arch_doc.content} +## Compliance & Architecture Documents +{docs_block} ## Pull Request **Title:** {pr.title} @@ -66,7 +75,7 @@ def _build_prompt(pr: PullRequest, arch_doc: ArchitectureDoc) -> str: {diffs} ## Your Task -Review every changed file against the architecture document. +Review every changed file against ALL of the compliance and architecture documents listed above. Return a JSON object with this exact shape: {{ "summary": "", @@ -87,7 +96,7 @@ def _build_prompt(pr: PullRequest, arch_doc: ArchitectureDoc) -> str: # ── Response parser ─────────────────────────────────────────────────────────── -def _parse_response(raw: str, pr: PullRequest) -> ReviewResult: +def _parse_response(raw: str, pr: PullRequest, docs: List[ArchitectureDoc]) -> ReviewResult: """ Parse the raw JSON string returned by the Copilot model. @@ -95,8 +104,9 @@ def _parse_response(raw: str, pr: PullRequest) -> ReviewResult: so the pipeline never crashes due to a malformed LLM response. Args: - raw: Raw string content from the model's message. - pr : The original PullRequest (attached to the result). + raw : Raw string content from the model's message. + pr : The original PullRequest (attached to the result). + docs: The compliance documents used in this review. Returns: A fully-populated ReviewResult. @@ -118,6 +128,7 @@ def _parse_response(raw: str, pr: PullRequest) -> ReviewResult: comments=comments, summary=data.get("summary", ""), passed=data.get("passed", True), + compliance_docs=docs, ) except (json.JSONDecodeError, KeyError) as exc: # Graceful fallback — surface the raw response as an error comment @@ -134,23 +145,29 @@ def _parse_response(raw: str, pr: PullRequest) -> ReviewResult: ], summary="Review could not be parsed.", passed=False, + compliance_docs=docs, ) # ── Public entry point ──────────────────────────────────────────────────────── -def run_review(pr: PullRequest, arch_doc: ArchitectureDoc) -> ReviewResult: +def run_review(pr: PullRequest, docs: List[ArchitectureDoc]) -> ReviewResult: """ - Orchestrate the full Copilot review for one PR. + Orchestrate the full Copilot review for one PR against one or more + compliance / architecture documents. Uses GitHub Models API (models.inference.ai.azure.com) which accepts a GitHub PAT directly — no token exchange needed. Steps: 1. Initialise the OpenAI client pointed at GitHub Models API. - 2. Build the prompt from the PR diff and architecture document. + 2. Build the prompt from the PR diff and all compliance documents. 3. Send a chat completion request to gpt-4o. 4. Parse and return the structured ReviewResult. + + Args: + pr : The PullRequest to review. + docs: One or more ArchitectureDoc objects to check compliance against. """ # Explicitly remove any OPENAI_BASE_URL or OPENAI_API_KEY that could # redirect the client to the internal Copilot endpoint @@ -164,7 +181,7 @@ def run_review(pr: PullRequest, arch_doc: ArchitectureDoc) -> ReviewResult: ) # 2. Build prompt - prompt = _build_prompt(pr, arch_doc) + prompt = _build_prompt(pr, docs) # 3. Call the API response = client.chat.completions.create( @@ -174,7 +191,8 @@ def run_review(pr: PullRequest, arch_doc: ArchitectureDoc) -> ReviewResult: "role": "system", "content": ( "You are an expert software architect. " - "You review code changes for compliance with architecture principles. " + "You review code changes for compliance with architecture principles " + "and company policies. " "Always respond with valid JSON only." ), }, @@ -187,4 +205,4 @@ def run_review(pr: PullRequest, arch_doc: ArchitectureDoc) -> ReviewResult: raw = response.choices[0].message.content or "" # 4. Parse and return - return _parse_response(raw, pr) + return _parse_response(raw, pr, docs) diff --git a/src/types.py b/src/types.py index e4e531e..e0324ff 100644 --- a/src/types.py +++ b/src/types.py @@ -115,12 +115,14 @@ class ReviewResult: """ Complete output of one review run. Fields: - pr : The PullRequest reviewed. - comments: All ReviewComments raised. - summary : High-level summary paragraph. - passed : False if any 'error' severity comment exists. + pr : The PullRequest reviewed. + comments : All ReviewComments raised. + summary : High-level summary paragraph. + passed : False if any 'error' severity comment exists. + compliance_docs: Compliance documents used in this review. """ pr: PullRequest comments: List[ReviewComment] = field(default_factory=list) summary: str = "" passed: bool = True + compliance_docs: List[ArchitectureDoc] = field(default_factory=list)