diff --git a/scientific-image-metadata-provenance-guard/README.md b/scientific-image-metadata-provenance-guard/README.md new file mode 100644 index 00000000..5680a9bb --- /dev/null +++ b/scientific-image-metadata-provenance-guard/README.md @@ -0,0 +1,42 @@ +# Scientific Image Metadata Provenance Guard + +This module adds a dependency-free provenance guard for scientific image +records before they are published into the Scientific Knowledge Graph. It is +focused on image nodes and safe graph edges, not duplicate-panel detection or +manuscript image-integrity review. + +## What It Checks + +- Source artifact, dataset, and protocol linkage for each image node. +- SHA-256 checksum format before KG publication. +- Acquisition date versus publication date chronology. +- Channel and pixel-size metadata for microscopy and fluorescence images. +- License and access state before public graph release. +- Private identifier leakage in filenames, metadata, and privacy tags. +- Derived figure provenance back to source images or datasets. + +## Outputs + +The guard emits: + +- A JSON report with per-record `publish`, `review`, or `block` decisions. +- Safe graph edges suitable for KG ingestion or reviewer inspection. +- Redaction actions for private or high-risk fields. +- A Markdown reviewer summary. +- A compact SVG decision chart for PR review and demos. 
+ +## Run + +```bash +python3 scientific-image-metadata-provenance-guard/image_metadata_provenance_guard.py \ + --sample \ + --json scientific-image-metadata-provenance-guard/demo/report.json \ + --markdown scientific-image-metadata-provenance-guard/demo/summary.md \ + --svg scientific-image-metadata-provenance-guard/demo/graph.svg +``` + +## Test + +```bash +python3 -m unittest scientific-image-metadata-provenance-guard/test_image_metadata_provenance_guard.py +``` diff --git a/scientific-image-metadata-provenance-guard/demo/graph.svg b/scientific-image-metadata-provenance-guard/demo/graph.svg new file mode 100644 index 00000000..04c353d8 --- /dev/null +++ b/scientific-image-metadata-provenance-guard/demo/graph.svg @@ -0,0 +1,7 @@ + + + Image Metadata KG Guard + Publication decisions for scientific image nodes + publish1review2block2 + Safe edges: 22 | Redactions: 4 + diff --git a/scientific-image-metadata-provenance-guard/demo/report.json b/scientific-image-metadata-provenance-guard/demo/report.json new file mode 100644 index 00000000..a2bfe3cb --- /dev/null +++ b/scientific-image-metadata-provenance-guard/demo/report.json @@ -0,0 +1,316 @@ +{ + "guard": "scientific-image-metadata-provenance-guard", + "records": [ + { + "decision": "publish", + "findings": [], + "image_id": "IMG-CELL-001", + "redactions": [], + "release_scope": "public_kg", + "safe_edges": [ + { + "evidence": "metadata", + "object": "dataset:DS-ASTROCYTE-ATLAS-2026", + "predicate": "member_of_dataset", + "subject": "image:IMG-CELL-001", + "visibility": "public" + }, + { + "evidence": "metadata", + "object": "protocol:PR-IMMUNO-STAIN-V2", + "predicate": "uses_protocol", + "subject": "image:IMG-CELL-001", + "visibility": "public" + }, + { + "evidence": "metadata", + "object": "instrument:INST-CONFOCAL-A1", + "predicate": "captured_by", + "subject": "image:IMG-CELL-001", + "visibility": "public" + }, + { + "evidence": "metadata", + "object": 
"artifact:s3://scibase-lab/raw/astrocyte/IMG-CELL-001.ome.tiff", + "predicate": "has_source_artifact", + "subject": "image:IMG-CELL-001", + "visibility": "public" + }, + { + "evidence": "metadata", + "object": "license:CC-BY-4.0", + "predicate": "released_under", + "subject": "image:IMG-CELL-001", + "visibility": "public" + } + ], + "title": "Confocal astrocyte culture panel" + }, + { + "decision": "review", + "findings": [ + { + "code": "MISSING_CHANNELS", + "field": "channels", + "message": "Microscopy image should include channel metadata before KG recommendation use.", + "severity": "review" + }, + { + "code": "MISSING_PIXEL_SIZE", + "field": "pixel_size_um", + "message": "Microscopy image should include pixel_size_um for scale-aware graph navigation.", + "severity": "review" + } + ], + "image_id": "IMG-SLIDE-002", + "redactions": [], + "release_scope": "review_queue", + "safe_edges": [ + { + "evidence": "metadata", + "object": "dataset:DS-EMBRYO-STAIN-2026", + "predicate": "member_of_dataset", + "subject": "image:IMG-SLIDE-002", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "protocol:PR-EMBRYO-STAIN-V1", + "predicate": "uses_protocol", + "subject": "image:IMG-SLIDE-002", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "instrument:INST-WIDEFIELD-7", + "predicate": "captured_by", + "subject": "image:IMG-SLIDE-002", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "artifact:s3://scibase-lab/raw/embryo/IMG-SLIDE-002.tiff", + "predicate": "has_source_artifact", + "subject": "image:IMG-SLIDE-002", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "license:CC-BY-4.0", + "predicate": "released_under", + "subject": "image:IMG-SLIDE-002", + "visibility": "internal" + } + ], + "title": "Widefield embryo slide" + }, + { + "decision": "block", + "findings": [ + { + "code": "ACQUIRED_AFTER_PUBLICATION", + "field": "acquired_at", + "message": "Image acquisition date is 
later than the publication date.", + "severity": "block" + }, + { + "code": "NON_PUBLIC_ACCESS", + "field": "access", + "message": "Image is not public and should be kept out of the public KG release.", + "severity": "review" + }, + { + "code": "LICENSE_REVIEW_REQUIRED", + "field": "license", + "message": "Image license is missing or not clearly open for graph publication.", + "severity": "review" + }, + { + "code": "PRIVATE_TAG_PRESENT", + "field": "privacy_tags", + "message": "Privacy tags indicate private identifiers that must not enter public KG nodes.", + "severity": "block" + }, + { + "code": "PRIVATE_VALUE_IN_FILENAME", + "field": "filename", + "message": "Filename appears to contain private identifiers.", + "severity": "block" + }, + { + "code": "PRIVATE_METADATA_FIELD", + "field": "metadata.mrn", + "message": "Metadata contains a private field name or value.", + "severity": "block" + }, + { + "code": "PRIVATE_METADATA_FIELD", + "field": "metadata.patient_name", + "message": "Metadata contains a private field name or value.", + "severity": "block" + } + ], + "image_id": "IMG-PATH-003", + "redactions": [ + { + "action": "remove from public graph payload", + "field": "privacy_tags" + }, + { + "action": "replace with stable image_id", + "field": "filename" + }, + { + "action": "drop before KG release", + "field": "metadata.mrn" + }, + { + "action": "drop before KG release", + "field": "metadata.patient_name" + } + ], + "release_scope": "do_not_publish", + "safe_edges": [ + { + "evidence": "metadata", + "object": "dataset:DS-PATHOLOGY-RESTRICTED", + "predicate": "member_of_dataset", + "subject": "image:IMG-PATH-003", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "protocol:PR-HISTOLOGY-REVIEW", + "predicate": "uses_protocol", + "subject": "image:IMG-PATH-003", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "instrument:INST-SLIDE-SCANNER-2", + "predicate": "captured_by", + "subject": "image:IMG-PATH-003", + 
"visibility": "internal" + }, + { + "evidence": "metadata", + "object": "artifact:s3://restricted-lab/raw/pathology/IMG-PATH-003.svs", + "predicate": "has_source_artifact", + "subject": "image:IMG-PATH-003", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "license:restricted", + "predicate": "released_under", + "subject": "image:IMG-PATH-003", + "visibility": "internal" + } + ], + "title": "Clinical pathology field with leaked identifiers" + }, + { + "decision": "block", + "findings": [ + { + "code": "MISSING_DERIVED_SOURCE", + "field": "derived_from", + "message": "Derived figure cannot enter the KG without source image or artifact lineage.", + "severity": "block" + } + ], + "image_id": "FIG-DERIVED-004", + "redactions": [], + "release_scope": "do_not_publish", + "safe_edges": [ + { + "evidence": "metadata", + "object": "dataset:DS-CELL-MIGRATION-2026", + "predicate": "member_of_dataset", + "subject": "image:FIG-DERIVED-004", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "protocol:PR-FIGURE-ASSEMBLY", + "predicate": "uses_protocol", + "subject": "image:FIG-DERIVED-004", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "license:CC-BY-4.0", + "predicate": "released_under", + "subject": "image:FIG-DERIVED-004", + "visibility": "internal" + } + ], + "title": "Composite figure without source chain" + }, + { + "decision": "review", + "findings": [ + { + "code": "MISSING_CHANNELS", + "field": "channels", + "message": "Microscopy image should include channel metadata before KG recommendation use.", + "severity": "review" + }, + { + "code": "NON_PUBLIC_ACCESS", + "field": "access", + "message": "Image is not public and should be kept out of the public KG release.", + "severity": "review" + }, + { + "code": "LICENSE_REVIEW_REQUIRED", + "field": "license", + "message": "Image license is missing or not clearly open for graph publication.", + "severity": "review" + } + ], + "image_id": 
"IMG-EMBARGO-005", + "redactions": [], + "release_scope": "internal_review_only", + "safe_edges": [ + { + "evidence": "metadata", + "object": "dataset:DS-CRYOEM-EMBARGO-2026", + "predicate": "member_of_dataset", + "subject": "image:IMG-EMBARGO-005", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "protocol:PR-CRYOEM-CAPTURE-V4", + "predicate": "uses_protocol", + "subject": "image:IMG-EMBARGO-005", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "instrument:INST-CRYOEM-4", + "predicate": "captured_by", + "subject": "image:IMG-EMBARGO-005", + "visibility": "internal" + }, + { + "evidence": "metadata", + "object": "artifact:s3://scibase-lab/embargo/cryo/IMG-EMBARGO-005.mrc", + "predicate": "has_source_artifact", + "subject": "image:IMG-EMBARGO-005", + "visibility": "internal" + } + ], + "title": "Embargoed cryo-EM map preview" + } + ], + "summary": { + "block": 2, + "publish": 1, + "redaction_count": 4, + "review": 2, + "safe_edge_count": 22, + "total": 5 + } +} diff --git a/scientific-image-metadata-provenance-guard/demo/summary.md b/scientific-image-metadata-provenance-guard/demo/summary.md new file mode 100644 index 00000000..3d568a8c --- /dev/null +++ b/scientific-image-metadata-provenance-guard/demo/summary.md @@ -0,0 +1,43 @@ +# Scientific Image Metadata Provenance Guard Summary + +- Total records: 5 +- Publish: 1 +- Review: 2 +- Block: 2 +- Safe graph edges: 22 +- Redactions: 4 + +| Image | Decision | Release scope | Findings | +| --- | --- | --- | --- | +| IMG-CELL-001 | publish | public_kg | none | +| IMG-SLIDE-002 | review | review_queue | MISSING_CHANNELS, MISSING_PIXEL_SIZE | +| IMG-PATH-003 | block | do_not_publish | ACQUIRED_AFTER_PUBLICATION, NON_PUBLIC_ACCESS, LICENSE_REVIEW_REQUIRED, PRIVATE_TAG_PRESENT, PRIVATE_VALUE_IN_FILENAME, PRIVATE_METADATA_FIELD, PRIVATE_METADATA_FIELD | +| FIG-DERIVED-004 | block | do_not_publish | MISSING_DERIVED_SOURCE | +| IMG-EMBARGO-005 | review | internal_review_only | 
MISSING_CHANNELS, NON_PUBLIC_ACCESS, LICENSE_REVIEW_REQUIRED | + +## Reviewer Actions + +### IMG-SLIDE-002 +- REVIEW MISSING_CHANNELS: Microscopy image should include channel metadata before KG recommendation use. +- REVIEW MISSING_PIXEL_SIZE: Microscopy image should include pixel_size_um for scale-aware graph navigation. + +### IMG-PATH-003 +- BLOCK ACQUIRED_AFTER_PUBLICATION: Image acquisition date is later than the publication date. +- REVIEW NON_PUBLIC_ACCESS: Image is not public and should be kept out of the public KG release. +- REVIEW LICENSE_REVIEW_REQUIRED: Image license is missing or not clearly open for graph publication. +- BLOCK PRIVATE_TAG_PRESENT: Privacy tags indicate private identifiers that must not enter public KG nodes. +- BLOCK PRIVATE_VALUE_IN_FILENAME: Filename appears to contain private identifiers. +- BLOCK PRIVATE_METADATA_FIELD: Metadata contains a private field name or value. +- BLOCK PRIVATE_METADATA_FIELD: Metadata contains a private field name or value. +- Redact `privacy_tags`: remove from public graph payload +- Redact `filename`: replace with stable image_id +- Redact `metadata.mrn`: drop before KG release +- Redact `metadata.patient_name`: drop before KG release + +### FIG-DERIVED-004 +- BLOCK MISSING_DERIVED_SOURCE: Derived figure cannot enter the KG without source image or artifact lineage. + +### IMG-EMBARGO-005 +- REVIEW MISSING_CHANNELS: Microscopy image should include channel metadata before KG recommendation use. +- REVIEW NON_PUBLIC_ACCESS: Image is not public and should be kept out of the public KG release. +- REVIEW LICENSE_REVIEW_REQUIRED: Image license is missing or not clearly open for graph publication. 
#!/usr/bin/env python3
"""Guard scientific image metadata before Knowledge Graph publication."""

from __future__ import annotations

import argparse
import datetime as dt
import html
import json
import math
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Iterable


# A SHA-256 digest rendered as exactly 64 hexadecimal characters.
CHECKSUM_RE = re.compile(r"^[0-9a-fA-F]{64}$")
# Field *names* that suggest the value identifies a person.
PRIVATE_FIELD_RE = re.compile(
    r"(patient|mrn|medical_record|dob|date_of_birth|email|phone|ssn|address)",
    re.IGNORECASE,
)
# Field *values* that look like an MRN, a US SSN, or an e-mail address.
PRIVATE_VALUE_RE = re.compile(
    r"(MRN[_ -]?\d{4,}|\b\d{3}-\d{2}-\d{4}\b|[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,})",
    re.IGNORECASE,
)
MICROSCOPY_MODALITIES = {
    "confocal_microscopy",
    "fluorescence_microscopy",
    "widefield_microscopy",
    "electron_microscopy",
    "microscopy",
}
NON_PUBLIC_ACCESS = {"private", "restricted", "embargoed", "confidential"}
OPEN_LICENSE_PREFIXES = ("CC-", "MIT", "BSD", "Apache", "ODC-")
# Privacy tags that always block publication.
PRIVATE_TAGS = frozenset({"phi", "pii", "private_identifier", "patient_identifier"})


@dataclass(frozen=True)
class Finding:
    """One issue raised against a record; ``severity`` is "block" or "review"."""

    code: str
    severity: str
    message: str
    field: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-ready dict; ``field`` is omitted when empty."""
        data = {
            "code": self.code,
            "severity": self.severity,
            "message": self.message,
        }
        if self.field:
            data["field"] = self.field
        return data


@dataclass(frozen=True)
class GraphEdge:
    """A subject/predicate/object triple with a visibility label."""

    subject: str
    predicate: str
    object: str
    visibility: str = "public"
    evidence: str = "metadata"

    def to_dict(self) -> dict[str, str]:
        """Return the edge as a JSON-ready dict."""
        return {
            "subject": self.subject,
            "predicate": self.predicate,
            "object": self.object,
            "visibility": self.visibility,
            "evidence": self.evidence,
        }


@dataclass
class GuardResult:
    """Decision, findings, redactions and graph edges for one image record."""

    image_id: str
    title: str
    decision: str
    release_scope: str
    findings: list[Finding] = field(default_factory=list)
    redactions: list[dict[str, str]] = field(default_factory=list)
    safe_edges: list[GraphEdge] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a JSON-ready dict."""
        return {
            "image_id": self.image_id,
            "title": self.title,
            "decision": self.decision,
            "release_scope": self.release_scope,
            "findings": [finding.to_dict() for finding in self.findings],
            "redactions": self.redactions,
            "safe_edges": [edge.to_dict() for edge in self.safe_edges],
        }


def _parse_date(value: Any) -> dt.date | None:
    """Return *value* as a ``date``, or ``None`` when empty or unparseable."""
    if not value:
        return None
    # datetime is a subclass of date, but comparing a datetime with a plain
    # date raises TypeError, so reduce it to its calendar date first.
    if isinstance(value, dt.datetime):
        return value.date()
    if isinstance(value, dt.date):
        return value
    try:
        # Only the leading YYYY-MM-DD part is used; any time suffix is ignored.
        return dt.date.fromisoformat(str(value)[:10])
    except ValueError:
        return None


def _as_text(value: Any) -> str:
    """Render *value* as text so the private-value regex can scan it."""
    if value is None:
        return ""
    if isinstance(value, (dict, list)):
        # default=str keeps the scan from crashing on values json cannot encode.
        return json.dumps(value, sort_keys=True, default=str)
    return str(value)


def _as_list(value: Any) -> list[Any]:
    """Return *value* as a list, treating a bare string as a single item.

    Without this, a string such as ``"phi"`` would be iterated character by
    character and never match a whole tag or source id.
    """
    if not value:
        return []
    if isinstance(value, (str, bytes)):
        return [value]
    try:
        return list(value)
    except TypeError:
        return [value]


def _contains_private_value(value: Any) -> bool:
    """Return True when the text form of *value* matches ``PRIVATE_VALUE_RE``."""
    return bool(PRIVATE_VALUE_RE.search(_as_text(value)))


def _field_has_private_name(field_name: str) -> bool:
    """Return True when *field_name* matches ``PRIVATE_FIELD_RE``."""
    return bool(PRIVATE_FIELD_RE.search(field_name))


def _has_nested_private_name(value: Any) -> bool:
    """Return True when any dict key nested inside *value* has a private name."""
    if isinstance(value, dict):
        return any(
            _field_has_private_name(str(key)) or _has_nested_private_name(item)
            for key, item in value.items()
        )
    if isinstance(value, (list, tuple)):
        return any(_has_nested_private_name(item) for item in value)
    return False


def _open_license(license_value: str) -> bool:
    """Return True when the license starts with a known open-license prefix."""
    normalized = license_value.strip()
    return bool(normalized) and normalized.startswith(OPEN_LICENSE_PREFIXES)


def _valid_pixel_size(pixel_size: Any) -> bool:
    """Return True for a positive, finite, non-boolean number."""
    # bool is a subclass of int, so True would otherwise pass as 1.
    if isinstance(pixel_size, bool) or not isinstance(pixel_size, (int, float)):
        return False
    # NaN compares False against everything, so "<= 0" alone lets it through.
    if isinstance(pixel_size, float) and not math.isfinite(pixel_size):
        return False
    return pixel_size > 0


def evaluate_record(record: dict[str, Any]) -> GuardResult:
    """Evaluate one image metadata record and return a KG publication decision.

    Args:
        record: Mapping of image metadata fields; missing keys are treated as
            empty values.

    Returns:
        A ``GuardResult`` holding the decision ("publish", "review" or
        "block"), the release scope, the findings in check order, the
        redaction actions, and the graph edges built from the record.
    """

    image_id = str(record.get("image_id") or "").strip() or "UNKNOWN"
    title = str(record.get("title") or image_id)
    modality = str(record.get("modality") or "").strip().lower()
    image_kind = str(record.get("image_kind") or "raw_image").strip().lower()
    source_artifact = str(record.get("source_artifact") or "").strip()
    dataset_id = str(record.get("dataset_id") or "").strip()
    protocol_id = str(record.get("protocol_id") or "").strip()
    instrument_id = str(record.get("instrument_id") or "").strip()
    access = str(record.get("access") or "public").strip().lower()
    license_value = str(record.get("license") or "").strip()
    filename = str(record.get("filename") or "").strip()
    channels = record.get("channels") or []
    pixel_size = record.get("pixel_size_um")
    # Blank or null entries carry no lineage, so they must not count as a source.
    derived_from = [
        text
        for text in (
            str(source).strip()
            for source in _as_list(record.get("derived_from"))
            if source is not None
        )
        if text
    ]
    metadata = record.get("metadata") or {}
    privacy_tags = [
        str(tag).strip().lower() for tag in _as_list(record.get("privacy_tags"))
    ]
    findings: list[Finding] = []
    redactions: list[dict[str, str]] = []

    if image_id == "UNKNOWN":
        findings.append(
            Finding("MISSING_IMAGE_ID", "block", "Image node is missing a stable image_id.", "image_id")
        )

    checksum = str(record.get("sha256") or "").strip()
    if not CHECKSUM_RE.match(checksum):
        findings.append(
            Finding(
                "INVALID_CHECKSUM",
                "block",
                "Image checksum must be a 64-character SHA-256 hex digest.",
                "sha256",
            )
        )

    if not dataset_id:
        findings.append(
            Finding("MISSING_DATASET", "review", "Image has no dataset node for KG membership.", "dataset_id")
        )

    if not protocol_id:
        findings.append(
            Finding("MISSING_PROTOCOL", "review", "Image has no protocol node for provenance.", "protocol_id")
        )

    if image_kind == "derived_figure":
        if not derived_from and not source_artifact:
            findings.append(
                Finding(
                    "MISSING_DERIVED_SOURCE",
                    "block",
                    "Derived figure cannot enter the KG without source image or artifact lineage.",
                    "derived_from",
                )
            )
    elif not source_artifact:
        findings.append(
            Finding(
                "MISSING_SOURCE_ARTIFACT",
                "review",
                "Raw image has no source artifact location for provenance audit.",
                "source_artifact",
            )
        )

    acquired_at = _parse_date(record.get("acquired_at"))
    publication_date = _parse_date(record.get("publication_date"))
    if acquired_at is None:
        findings.append(
            Finding("MISSING_ACQUISITION_DATE", "review", "Image has no parseable acquisition date.", "acquired_at")
        )
    if publication_date is None:
        findings.append(
            Finding("MISSING_PUBLICATION_DATE", "review", "Image has no parseable publication date.", "publication_date")
        )
    if acquired_at and publication_date and acquired_at > publication_date:
        findings.append(
            Finding(
                "ACQUIRED_AFTER_PUBLICATION",
                "block",
                "Image acquisition date is later than the publication date.",
                "acquired_at",
            )
        )

    if modality in MICROSCOPY_MODALITIES:
        if not isinstance(channels, list) or not channels:
            findings.append(
                Finding(
                    "MISSING_CHANNELS",
                    "review",
                    "Microscopy image should include channel metadata before KG recommendation use.",
                    "channels",
                )
            )
        if pixel_size is None or pixel_size == "":
            findings.append(
                Finding(
                    "MISSING_PIXEL_SIZE",
                    "review",
                    "Microscopy image should include pixel_size_um for scale-aware graph navigation.",
                    "pixel_size_um",
                )
            )
        elif not _valid_pixel_size(pixel_size):
            findings.append(
                Finding(
                    "INVALID_PIXEL_SIZE",
                    "review",
                    "pixel_size_um must be a positive number.",
                    "pixel_size_um",
                )
            )

    if access in NON_PUBLIC_ACCESS:
        findings.append(
            Finding(
                "NON_PUBLIC_ACCESS",
                "review",
                "Image is not public and should be kept out of the public KG release.",
                "access",
            )
        )

    if not _open_license(license_value):
        findings.append(
            Finding(
                "LICENSE_REVIEW_REQUIRED",
                "review",
                "Image license is missing or not clearly open for graph publication.",
                "license",
            )
        )

    if any(tag in PRIVATE_TAGS for tag in privacy_tags):
        findings.append(
            Finding(
                "PRIVATE_TAG_PRESENT",
                "block",
                "Privacy tags indicate private identifiers that must not enter public KG nodes.",
                "privacy_tags",
            )
        )
        redactions.append({"field": "privacy_tags", "action": "remove from public graph payload"})

    if _contains_private_value(filename):
        findings.append(
            Finding(
                "PRIVATE_VALUE_IN_FILENAME",
                "block",
                "Filename appears to contain private identifiers.",
                "filename",
            )
        )
        redactions.append({"field": "filename", "action": "replace with stable image_id"})

    if isinstance(metadata, dict):
        # Sort on the text form of the key so mixed key types cannot crash sorted().
        for key, value in sorted(metadata.items(), key=lambda item: str(item[0])):
            if (
                _field_has_private_name(str(key))
                or _contains_private_value(value)
                or _has_nested_private_name(value)
            ):
                findings.append(
                    Finding(
                        "PRIVATE_METADATA_FIELD",
                        "block",
                        "Metadata contains a private field name or value.",
                        f"metadata.{key}",
                    )
                )
                redactions.append({"field": f"metadata.{key}", "action": "drop before KG release"})
    elif _contains_private_value(metadata) or _has_nested_private_name(metadata):
        # Metadata that is not a mapping (a list or a string) is scanned as a whole.
        findings.append(
            Finding(
                "PRIVATE_METADATA_FIELD",
                "block",
                "Metadata contains a private field name or value.",
                "metadata",
            )
        )
        redactions.append({"field": "metadata", "action": "drop before KG release"})

    decision = _decision_from_findings(findings)
    release_scope = _release_scope(decision, access)
    safe_edges = _build_safe_edges(
        image_id=image_id,
        dataset_id=dataset_id,
        protocol_id=protocol_id,
        instrument_id=instrument_id,
        source_artifact=source_artifact,
        license_value=license_value,
        derived_from=derived_from,
        release_scope=release_scope,
        allow_public=decision == "publish",
    )

    return GuardResult(
        image_id=image_id,
        title=title,
        decision=decision,
        release_scope=release_scope,
        findings=findings,
        redactions=redactions,
        safe_edges=safe_edges,
    )


def _decision_from_findings(findings: Iterable[Finding]) -> str:
    """Return "block", "review" or "publish" from the worst severity present."""
    severities = {finding.severity for finding in findings}
    if "block" in severities:
        return "block"
    if "review" in severities:
        return "review"
    return "publish"


def _release_scope(decision: str, access: str) -> str:
    """Map a decision and access level to a release scope label."""
    if decision == "block":
        return "do_not_publish"
    if access in NON_PUBLIC_ACCESS:
        return "internal_review_only"
    if decision == "review":
        return "review_queue"
    return "public_kg"


def _build_safe_edges(
    *,
    image_id: str,
    dataset_id: str,
    protocol_id: str,
    instrument_id: str,
    source_artifact: str,
    license_value: str,
    derived_from: list[Any],
    release_scope: str,
    allow_public: bool,
) -> list[GraphEdge]:
    """Build graph edges for every non-empty identifier on the record.

    Edges are "public" only when ``allow_public`` is true and the scope is
    "public_kg"; otherwise every edge is marked "internal".
    """
    visibility = "public" if allow_public and release_scope == "public_kg" else "internal"
    image_node = f"image:{image_id}"
    edges: list[GraphEdge] = []

    if dataset_id:
        edges.append(GraphEdge(image_node, "member_of_dataset", f"dataset:{dataset_id}", visibility))
    if protocol_id:
        edges.append(GraphEdge(image_node, "uses_protocol", f"protocol:{protocol_id}", visibility))
    if instrument_id:
        edges.append(GraphEdge(image_node, "captured_by", f"instrument:{instrument_id}", visibility))
    if source_artifact:
        edges.append(GraphEdge(image_node, "has_source_artifact", f"artifact:{source_artifact}", visibility))
    if license_value:
        edges.append(GraphEdge(image_node, "released_under", f"license:{license_value}", visibility))
    # _as_list keeps a bare string source from being split into characters.
    for source in _as_list(derived_from):
        source_text = str(source).strip()
        if source_text:
            edges.append(GraphEdge(image_node, "derived_from", f"image:{source_text}", visibility))
    return edges


def evaluate_records(records: Iterable[dict[str, Any]]) -> dict[str, Any]:
    """Evaluate every record and return a report with summary counts."""
    results = [evaluate_record(record) for record in records]
    summary = {
        "total": len(results),
        "publish": sum(1 for result in results if result.decision == "publish"),
        "review": sum(1 for result in results if result.decision == "review"),
        "block": sum(1 for result in results if result.decision == "block"),
        "safe_edge_count": sum(len(result.safe_edges) for result in results),
        "redaction_count": sum(len(result.redactions) for result in results),
    }
    return {
        "guard": "scientific-image-metadata-provenance-guard",
        "summary": summary,
        "records": [result.to_dict() for result in results],
    }


def load_records(path: Path) -> list[dict[str, Any]]:
    """Load a JSON file holding a list of image metadata objects.

    Raises:
        ValueError: If the top-level value is not a list, or any entry is not
            a JSON object.
    """
    with path.open("r", encoding="utf-8") as handle:
        records = json.load(handle)
    if not isinstance(records, list):
        raise ValueError("Input JSON must contain a list of image metadata records.")
    for index, record in enumerate(records):
        # Fail here with a clear message rather than later inside evaluate_record.
        if not isinstance(record, dict):
            raise ValueError(f"Image metadata record at index {index} must be a JSON object.")
    return records


def sample_path() -> Path:
    """Return the path of the sample records file next to this module."""
    return Path(__file__).with_name("sample_image_records.json")


def write_json(report: dict[str, Any], path: Path) -> None:
    """Write the report as indented, key-sorted JSON, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(report, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def _md_cell(value: Any) -> str:
    """Escape pipe characters so a value cannot break a Markdown table row."""
    return str(value).replace("|", "\\|")


def write_markdown(report: dict[str, Any], path: Path) -> None:
    """Write the Markdown summary: counts, a decision table, reviewer actions."""
    path.parent.mkdir(parents=True, exist_ok=True)
    summary = report["summary"]
    lines = [
        "# Scientific Image Metadata Provenance Guard Summary",
        "",
        f"- Total records: {summary['total']}",
        f"- Publish: {summary['publish']}",
        f"- Review: {summary['review']}",
        f"- Block: {summary['block']}",
        f"- Safe graph edges: {summary['safe_edge_count']}",
        f"- Redactions: {summary['redaction_count']}",
        "",
        "| Image | Decision | Release scope | Findings |",
        "| --- | --- | --- | --- |",
    ]
    for record in report["records"]:
        findings = ", ".join(finding["code"] for finding in record["findings"]) or "none"
        lines.append(
            f"| {_md_cell(record['image_id'])} | {record['decision']} "
            f"| {record['release_scope']} | {findings} |"
        )
    lines.extend(["", "## Reviewer Actions", ""])
    for record in report["records"]:
        # Published records need no reviewer action.
        if record["decision"] == "publish":
            continue
        lines.append(f"### {record['image_id']}")
        for finding in record["findings"]:
            lines.append(f"- {finding['severity'].upper()} {finding['code']}: {finding['message']}")
        for redaction in record["redactions"]:
            lines.append(f"- Redact `{redaction['field']}`: {redaction['action']}")
        lines.append("")
    path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")


def write_svg(report: dict[str, Any], path: Path) -> None:
    """Write a three-bar SVG chart of publish/review/block counts.

    NOTE(review): the element markup was rebuilt; the coordinates, fonts and
    background colour below are layout choices. Confirm them against the
    intended design before regenerating the demo artifact.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    summary = report["summary"]
    bars = [
        ("publish", "#2f855a", summary["publish"]),
        ("review", "#b7791f", summary["review"]),
        ("block", "#c53030", summary["block"]),
    ]
    # The extra 1 avoids dividing by zero when every count is 0.
    max_count = max([count for _, _, count in bars] + [1])
    bar_markup = []
    for index, (label, color, count) in enumerate(bars):
        y = 80 + index * 58
        width = int(300 * count / max_count)
        bar_markup.append(
            f'<text x="40" y="{y + 22}" font-family="sans-serif" font-size="16" '
            f'fill="#2d3748">{html.escape(label)}</text>'
        )
        bar_markup.append(
            f'<rect x="150" y="{y}" width="{width}" height="32" rx="4" fill="{color}"/>'
        )
        bar_markup.append(
            f'<text x="{150 + width + 12}" y="{y + 22}" font-family="sans-serif" '
            f'font-size="16" fill="#2d3748">{count}</text>'
        )
    svg = f"""<svg xmlns="http://www.w3.org/2000/svg" width="560" height="300" viewBox="0 0 560 300" role="img" aria-label="Image Metadata KG Guard decisions">
  <rect width="560" height="300" fill="#f7fafc"/>
  <text x="40" y="38" font-family="sans-serif" font-size="22" font-weight="bold" fill="#1a202c">Image Metadata KG Guard</text>
  <text x="40" y="62" font-family="sans-serif" font-size="14" fill="#4a5568">Publication decisions for scientific image nodes</text>
  {''.join(bar_markup)}
  <text x="40" y="270" font-family="sans-serif" font-size="14" fill="#4a5568">Safe edges: {summary['safe_edge_count']} | Redactions: {summary['redaction_count']}</text>
</svg>
"""
    path.write_text(svg, encoding="utf-8")


def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser; exactly one of --input or --sample is required."""
    parser = argparse.ArgumentParser(description=__doc__)
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--input", type=Path, help="Path to image metadata JSON records.")
    input_group.add_argument("--sample", action="store_true", help="Use the bundled sample image records.")
    parser.add_argument("--json", type=Path, help="Write JSON report to this path.")
    parser.add_argument("--markdown", type=Path, help="Write Markdown summary to this path.")
    parser.add_argument("--svg", type=Path, help="Write SVG summary chart to this path.")
    return parser


def main(argv: list[str] | None = None) -> int:
    """Run the guard from the command line and return the exit status (0)."""
    args = build_parser().parse_args(argv)
    records = load_records(sample_path() if args.sample else args.input)
    report = evaluate_records(records)

    if args.json:
        write_json(report, args.json)
    if args.markdown:
        write_markdown(report, args.markdown)
    if args.svg:
        write_svg(report, args.svg)

    summary = report["summary"]
    print(
        "Image metadata KG guard: "
        f"{summary['publish']} publish, {summary['review']} review, "
        f"{summary['block']} block, {summary['safe_edge_count']} safe edges."
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
with leaked identifiers", + "image_kind": "raw_image", + "modality": "pathology_slide", + "source_artifact": "s3://restricted-lab/raw/pathology/IMG-PATH-003.svs", + "dataset_id": "DS-PATHOLOGY-RESTRICTED", + "protocol_id": "PR-HISTOLOGY-REVIEW", + "instrument_id": "INST-SLIDE-SCANNER-2", + "sha256": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "acquired_at": "2026-06-12", + "publication_date": "2026-05-01", + "license": "restricted", + "access": "private", + "filename": "Jane_Doe_MRN_1938472_biopsy.svs", + "privacy_tags": [ + "phi" + ], + "metadata": { + "patient_name": "Jane Doe", + "mrn": "1938472", + "operator": "clinical-tech-3" + }, + "channels": [], + "pixel_size_um": 0.24 + }, + { + "image_id": "FIG-DERIVED-004", + "title": "Composite figure without source chain", + "image_kind": "derived_figure", + "modality": "composite_figure", + "source_artifact": "", + "dataset_id": "DS-CELL-MIGRATION-2026", + "protocol_id": "PR-FIGURE-ASSEMBLY", + "instrument_id": "", + "sha256": "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc", + "acquired_at": "2026-02-05", + "publication_date": "2026-04-01", + "license": "CC-BY-4.0", + "access": "public", + "filename": "figure_2_panel_c.png", + "privacy_tags": [], + "metadata": { + "assembly_tool": "imagej" + }, + "channels": [], + "pixel_size_um": null, + "derived_from": [] + }, + { + "image_id": "IMG-EMBARGO-005", + "title": "Embargoed cryo-EM map preview", + "image_kind": "raw_image", + "modality": "electron_microscopy", + "source_artifact": "s3://scibase-lab/embargo/cryo/IMG-EMBARGO-005.mrc", + "dataset_id": "DS-CRYOEM-EMBARGO-2026", + "protocol_id": "PR-CRYOEM-CAPTURE-V4", + "instrument_id": "INST-CRYOEM-4", + "sha256": "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd", + "acquired_at": "2026-03-04", + "publication_date": "2026-07-01", + "license": "", + "access": "embargoed", + "filename": "cryoem_preview_005.mrc", + "privacy_tags": [], + "metadata": { + 
"magnification": "105000x" + }, + "channels": [], + "pixel_size_um": 1.08 + } +] diff --git a/scientific-image-metadata-provenance-guard/test_image_metadata_provenance_guard.py b/scientific-image-metadata-provenance-guard/test_image_metadata_provenance_guard.py new file mode 100644 index 00000000..0568a27c --- /dev/null +++ b/scientific-image-metadata-provenance-guard/test_image_metadata_provenance_guard.py @@ -0,0 +1,114 @@ +import importlib.util +import json +import subprocess +import sys +import tempfile +import unittest +from pathlib import Path + + +MODULE_PATH = Path(__file__).with_name("image_metadata_provenance_guard.py") +SPEC = importlib.util.spec_from_file_location("image_metadata_provenance_guard", MODULE_PATH) +guard = importlib.util.module_from_spec(SPEC) +sys.modules[SPEC.name] = guard +SPEC.loader.exec_module(guard) + + +class ImageMetadataProvenanceGuardTest(unittest.TestCase): + def test_sample_summary_counts(self): + report = guard.evaluate_records(guard.load_records(guard.sample_path())) + + self.assertEqual(report["summary"]["total"], 5) + self.assertEqual(report["summary"]["publish"], 1) + self.assertEqual(report["summary"]["review"], 2) + self.assertEqual(report["summary"]["block"], 2) + self.assertGreaterEqual(report["summary"]["safe_edge_count"], 12) + + def test_publish_record_emits_public_graph_edges(self): + report = guard.evaluate_records(guard.load_records(guard.sample_path())) + published = next(record for record in report["records"] if record["image_id"] == "IMG-CELL-001") + + self.assertEqual(published["decision"], "publish") + self.assertEqual(published["release_scope"], "public_kg") + predicates = {edge["predicate"] for edge in published["safe_edges"]} + self.assertIn("member_of_dataset", predicates) + self.assertIn("uses_protocol", predicates) + self.assertIn("captured_by", predicates) + self.assertTrue(all(edge["visibility"] == "public" for edge in published["safe_edges"])) + + def 
test_phi_and_chronology_block_public_release(self): + report = guard.evaluate_records(guard.load_records(guard.sample_path())) + blocked = next(record for record in report["records"] if record["image_id"] == "IMG-PATH-003") + codes = {finding["code"] for finding in blocked["findings"]} + + self.assertEqual(blocked["decision"], "block") + self.assertEqual(blocked["release_scope"], "do_not_publish") + self.assertIn("ACQUIRED_AFTER_PUBLICATION", codes) + self.assertIn("PRIVATE_TAG_PRESENT", codes) + self.assertIn("PRIVATE_VALUE_IN_FILENAME", codes) + self.assertIn("PRIVATE_METADATA_FIELD", codes) + self.assertGreaterEqual(len(blocked["redactions"]), 3) + + def test_microscopy_without_channel_metadata_requires_review(self): + report = guard.evaluate_records(guard.load_records(guard.sample_path())) + reviewed = next(record for record in report["records"] if record["image_id"] == "IMG-SLIDE-002") + codes = {finding["code"] for finding in reviewed["findings"]} + + self.assertEqual(reviewed["decision"], "review") + self.assertIn("MISSING_CHANNELS", codes) + self.assertIn("MISSING_PIXEL_SIZE", codes) + self.assertTrue(all(edge["visibility"] == "internal" for edge in reviewed["safe_edges"])) + + def test_derived_figure_without_source_is_blocked(self): + report = guard.evaluate_records(guard.load_records(guard.sample_path())) + blocked = next(record for record in report["records"] if record["image_id"] == "FIG-DERIVED-004") + codes = {finding["code"] for finding in blocked["findings"]} + + self.assertEqual(blocked["decision"], "block") + self.assertIn("MISSING_DERIVED_SOURCE", codes) + + def test_report_writers_create_reviewer_artifacts(self): + report = guard.evaluate_records(guard.load_records(guard.sample_path())) + + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) + json_path = tmp_path / "report.json" + markdown_path = tmp_path / "summary.md" + svg_path = tmp_path / "graph.svg" + + guard.write_json(report, json_path) + guard.write_markdown(report, 
markdown_path) + guard.write_svg(report, svg_path) + + self.assertEqual(json.loads(json_path.read_text())["summary"]["total"], 5) + self.assertIn("IMG-PATH-003", markdown_path.read_text()) + self.assertIn("