DiamondLightSource
diff --git a/‎setup.py‎
Lines changed: 1 addition & 1 deletion b/‎setup.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/dlstbx/services/trigger_xchem.py‎
Lines changed: 139 additions & 51 deletions b/‎src/dlstbx/services/trigger_xchem.py‎
Lines changed: 139 additions & 51 deletions
@@ -78,7 +78,6 @@
     "mr_predict = dlstbx.wrapper.mr_predict:MRPredictWrapper",
     "mrbump = dlstbx.wrapper.mrbump:MrBUMPWrapper",
     "pandda_xchem = dlstbx.wrapper.pandda_xchem:PanDDAWrapper",
-    "pandda_post = dlstbx.wrapper.pandda_post:PanDDApostWrapper",
     "pipedream_xchem = dlstbx.wrapper.pipedream_xchem:PipedreamWrapper",
     "phaser_ellg  = dlstbx.wrapper.phaser_ellg:PhasereLLGWrapper",
     "rlv = dlstbx.wrapper.rlv:RLVWrapper",
@@ -91,6 +90,7 @@
     "stepped_transmission = dlstbx.wrapper.stepped_transmission:SteppedTransmissionWrapper",
     # "timg = dlstbx.wrapper.timg:TopazWrapper",  # tentatively disabled
     "topaz3 = dlstbx.wrapper.topaz3_wrapper:Topaz3Wrapper",
+    "xchem_collate = dlstbx.wrapper.xchem_collate:XChemCollateWrapper",
     "xia2 = dlstbx.wrapper.xia2:Xia2Wrapper",
     "xia2.multiplex = dlstbx.wrapper.xia2_multiplex:Xia2MultiplexWrapper",
     "xia2.overload = dlstbx.wrapper.xia2_overload:Xia2OverloadWrapper",
 
@@ -53,24 +53,31 @@ def create_metrics(self):
 
 class PanDDAParameters(pydantic.BaseModel):
     dcid: int = pydantic.Field(gt=0)
-    comparator_threshold: int = pydantic.Field(default=300)
+    comparator_threshold: int = pydantic.Field(default=350)
     automatic: Optional[bool] = False
     comment: Optional[str] = None
     scaling_id: list[int]
-    timeout: float = pydantic.Field(default=120, alias="timeout-minutes")
-    backoff_delay: float = pydantic.Field(default=45, alias="backoff-delay")
-    backoff_max_try: int = pydantic.Field(default=30, alias="backoff-max-try")
-    backoff_multiplier: float = pydantic.Field(default=1.1, alias="backoff-multiplier")
+    timeout: float = pydantic.Field(default=180, alias="timeout-minutes")
+    backoff_delay: float = pydantic.Field(default=20, alias="backoff-delay")
+    backoff_max_try: int = pydantic.Field(default=10, alias="backoff-max-try")
+    backoff_multiplier: float = pydantic.Field(default=2, alias="backoff-multiplier")
     pipedream: Optional[bool] = True
+    reprocessing: Optional[bool] = False
 
 
-class PanDDA_PostParameters(pydantic.BaseModel):
+class XChemCollate_Parameters(pydantic.BaseModel):
     dcid: int = pydantic.Field(gt=0)
+    program_id: int = pydantic.Field(gt=0)
     automatic: Optional[bool] = False
     comment: Optional[str] = None
     scaling_id: list[int]
     processing_directory: str
     timeout: float = pydantic.Field(default=60, alias="timeout-minutes")
+    backoff_delay: float = pydantic.Field(default=20, alias="backoff-delay")
+    backoff_max_try: int = pydantic.Field(default=10, alias="backoff-max-try")
+    backoff_multiplier: float = pydantic.Field(default=2, alias="backoff-multiplier")
+    pipedream: Optional[bool] = False
+    reprocessing: Optional[bool] = False
 
 
 class DLSTriggerXChem(CommonService):
@@ -230,6 +237,7 @@ def trigger_pandda_xchem(
         scaling_id = parameters.scaling_id[0]
         comparator_threshold = parameters.comparator_threshold
         pipedream = parameters.pipedream
+        reprocessing = parameters.reprocessing
 
         protein_info = get_protein_for_dcid(parameters.dcid, session)
         # protein_id = getattr(protein_info, "proteinId")
@@ -405,7 +413,7 @@ def trigger_pandda_xchem(
             )
             return {"success": True}
 
-        # If other dimple/PanDDA2 job is running, quit, dimple set to trigger even if it fails
+        # If other dimple/PanDDA2 job is running, quit, dimple will trigger PanDDA2 even if it fails
         min_start_time = datetime.now() - timedelta(hours=6)
 
         query = (
@@ -432,27 +440,6 @@ def trigger_pandda_xchem(
             )
             return {"success": True}
 
-        # Stop-gap, interval > max checkpoint time
-        min_start_time = datetime.now() - timedelta(minutes=30)
-
-        query = (
-            (
-                session.query(AutoProcProgram, ProcessingJob.dataCollectionId).join(
-                    ProcessingJob,
-                    ProcessingJob.processingJobId == AutoProcProgram.processingJobId,
-                )
-            )
-            .filter(ProcessingJob.dataCollectionId == dcid)
-            .filter(AutoProcProgram.processingPrograms.in_(["PanDDA2"]))
-            .filter(AutoProcProgram.recordTimeStamp > min_start_time)
-        )
-
-        if triggered_processing_job := query.first():
-            self.log.info(
-                f"Exiting PanDDA2/Pipedream trigger: another PanDDA2 job was recently launched for dcid {dcid}"
-            )
-            return {"success": True}
-
         # Now check if other upstream pipeline is running and if so, checkpoint (it might fail)
         min_start_time = datetime.now() - timedelta(hours=6)
         query = (
@@ -550,15 +537,15 @@ def trigger_pandda_xchem(
 
         df = pd.read_sql(query.statement, query.session.bind)
 
-        # use datasets processed in user spacegroup if possible
+        # prioritise datasets processed in user-defined spacegroup
         if "user_sg" in locals():
             df_filteredbysg = df[df["spaceGroup"] == user_sg]
 
             if not df_filteredbysg.empty:
                 df = df_filteredbysg
                 n_success_upstream = len(df)
                 self.log.info(
-                    f"There are {n_success_upstream} successful upstream jobs (excl fast-dp) in the user defined spacegroup {user_sg} \
+                    f"There are {n_success_upstream} successful upstream jobs (excl fast-dp) in the user-defined spacegroup {user_sg} \
                     selecting the best one based on I/sigI*completeness * #unique reflections, from the most recent processing batch"
                 )
 
@@ -611,7 +598,7 @@ def trigger_pandda_xchem(
             )
             return {"success": True}
 
-        # mark a new batch whenever the gap is >= 12 hours
+        # mark as new batch whenever the gap between jobs is >= 12 hours, consider most recent batch
         df2 = df2.sort_values("processingStartTime").reset_index(drop=True)
         df2["time_diff"] = df2["processingStartTime"].diff()
         df2["batch"] = (df2["time_diff"] >= pd.Timedelta(hours=12)).cumsum() + 1
@@ -712,13 +699,15 @@ def trigger_pandda_xchem(
         compound_dir = dataset_dir / "compound"
 
         self.log.info(f"Creating directory {dataset_dir}")
-        try:
-            compound_dir.mkdir(parents=True, exist_ok=False)
-        except FileExistsError:
-            self.log.info(
-                f"Exiting PanDDA2/Pipedream trigger: {dataset_dir} already exists"
-            )
-            return {"success": True}
+
+        if not reprocessing:
+            try:
+                compound_dir.mkdir(parents=True, exist_ok=False)
+            except FileExistsError:
+                self.log.info(
+                    f"Exiting PanDDA2/Pipedream trigger: {dataset_dir} already exists"
+                )
+                return {"success": True}
 
         # Copy the dimple files of the selected dataset
         shutil.copy(pdb, str(dataset_dir / "dimple.pdb"))
@@ -756,6 +745,8 @@ def trigger_pandda_xchem(
             "database_path": str(db_master),
             "upstream_mtz": pathlib.Path(upstream_mtz).parts[-1],
             "smiles": str(CompoundSMILES),
+            "pipedream": pipedream,
+            "reprocessing": reprocessing,
         }
 
         dataset_list = sorted([p.parts[-1] for p in model_dir.iterdir() if p.is_dir()])
@@ -799,45 +790,50 @@ def trigger_pandda_xchem(
         return {"success": True}
 
     @pydantic.validate_call(config={"arbitrary_types_allowed": True})
-    def trigger_pandda_xchem_post(
+    def trigger_xchem_collate(
         self,
         rw: workflows.recipe.RecipeWrapper,
         *,
         message: Dict,
-        parameters: PanDDA_PostParameters,
+        parameters: XChemCollate_Parameters,
         session: sqlalchemy.orm.session.Session,
         transaction: int,
         **kwargs,
     ):
-        """Trigger a PanDDA post-run job for an XChem fragment screening experiment.
+        """Trigger an XChem Collate job for an XChem fragment screening experiment.
         Recipe parameters are described below with appropriate ispyb placeholder "{}"
         values:
-        - target: set this to "pandda_xchem_post"
+        - target: set this to "xchem_collate"
         - dcid: the dataCollectionId for the given data collection i.e. "{ispyb_dcid}"
         - comment: a comment to be stored in the ProcessingJob.comment field
         - timeout-minutes: (optional) the max time (in minutes) allowed to wait for
         processing PanDDA jobs
         - automatic: boolean value passed to ProcessingJob.automatic field
         Example recipe parameters:
-        { "target": "pandda_xchem_post",
+        { "target": "xchem_collate",
             "dcid": 123456,
             "scaling_id": [123456],
             "processing_directory": '/dls/labxchem/data/lb42888/lb42888-1/processing',
             "automatic": true,
-            "comment": "PanDDA2 post-run",
         }
         """
 
         dcid = parameters.dcid
+        program_id = parameters.program_id
         scaling_id = parameters.scaling_id[0]
         processing_directory = pathlib.Path(parameters.processing_directory)
+        # reprocessing = parameters.reprocessing
+        pipedream = parameters.pipedream
 
         _, ispyb_info = dlstbx.ispybtbx.ispyb_filter({}, {"ispyb_dcid": dcid}, session)
         visit = ispyb_info.get("ispyb_visit", "")
         visit_proposal = visit.split("-")[0]
         visit_number = visit.split("-")[1]
 
-        # From proposal and visit get all dcids
+        protein_info = get_protein_for_dcid(parameters.dcid, session)
+        acronym = getattr(protein_info, "acronym")
+
+        # get all dcids for the visit
         query = (
             session.query(Proposal, BLSession, DataCollection)
             .join(BLSession, BLSession.proposalId == Proposal.proposalId)
@@ -850,6 +846,27 @@ def trigger_pandda_xchem_post(
         df = pd.read_sql(query.statement, query.session.bind)
         dcids = df["dataCollectionId"].tolist()
 
+        # trigger on the final PanDDA/Pipedream program_id for the visit
+        query = (
+            (
+                session.query(AutoProcProgram, ProcessingJob.dataCollectionId).join(
+                    ProcessingJob,
+                    ProcessingJob.processingJobId == AutoProcProgram.processingJobId,
+                )
+            )
+            .filter(ProcessingJob.dataCollectionId.in_(dcids))
+            .filter(AutoProcProgram.processingPrograms.in_(["PanDDA2", "Pipedream"]))
+            .filter(AutoProcProgram.autoProcProgramId > program_id)  # noqa E711
+        )
+
+        if triggered_processing_job := query.first():
+            self.log.info(
+                f"Aborting xchem_collate trigger for dcid {dcid} as processing job has been started for dcid {triggered_processing_job.dataCollectionId}"
+            )
+            return {"success": True}
+
+        # has processing finished for the current visit? checkpoint if not
+        min_start_time = datetime.now() - timedelta(hours=8)
         query = (
             (
                 session.query(AutoProcProgram, ProcessingJob.dataCollectionId).join(
@@ -858,7 +875,77 @@ def trigger_pandda_xchem_post(
                 )
             )
             .filter(ProcessingJob.dataCollectionId.in_(dcids))
-            .filter(AutoProcProgram.processingPrograms == "PanDDA2-post")
+            .filter(
+                AutoProcProgram.processingPrograms.in_(
+                    ["xia2 dials", "PanDDA2", "Pipedream"]
+                )
+            )
+            .filter(
+                or_(
+                    AutoProcProgram.processingStatus == None,  # noqa E711
+                    AutoProcProgram.processingStartTime == None,  # noqa E711
+                )
+            )
+            .filter(AutoProcProgram.recordTimeStamp > min_start_time)  # noqa E711
+        )
+
+        # Calculate message delay for backoff when max program id has finished but there remains
+        # some jobs to finish in which case we checkpoint with the calculated message delay
+        status = {
+            "ntry": 0,
+        }
+        if isinstance(message, dict):
+            status.update(message.get("trigger-status", {}))
+        message_delay = int(
+            parameters.backoff_delay * parameters.backoff_multiplier ** status["ntry"]
+        )
+        status["ntry"] += 1
+        self.log.debug(f"dcid={dcid}\nmessage_delay={message_delay}\n{status}")
+
+        # If there are any running (or yet to start) jobs, then checkpoint with delay
+        waiting_processing_jobs = query.all()
+        if n_waiting_processing_jobs := len(waiting_processing_jobs):
+            self.log.info(
+                f"Waiting on {n_waiting_processing_jobs} processing jobs for {dcid=} for XChemCollate"
+            )
+            waiting_appids = [
+                row.AutoProcProgram.autoProcProgramId for row in waiting_processing_jobs
+            ]
+            if status["ntry"] >= parameters.backoff_max_try:
+                # Give up waiting for this program to finish and trigger
+                # collate with remaining results that are available
+                self.log.info(
+                    f"Max-try exceeded, giving up waiting for related processings for appids {waiting_appids}\n"
+                )
+            else:
+                # Send results to myself for next round of processing
+                self.log.debug(f"Waiting for appids={waiting_appids}")
+                rw.checkpoint(
+                    {
+                        "trigger-status": status,
+                    },
+                    delay=message_delay,
+                    transaction=transaction,
+                )
+
+                return {"success": True}
+
+        self.log.debug(
+            f"PanDDA2/Pipedream processing has finished for {visit_proposal}-{visit_number}"
+        )
+
+        # Stop-gap
+        min_start_time = datetime.now() - timedelta(minutes=180)
+
+        query = (
+            session.query(AutoProcProgram, ProcessingJob.dataCollectionId)
+            .join(
+                ProcessingJob,
+                ProcessingJob.processingJobId == AutoProcProgram.processingJobId,
+            )
+            .filter(ProcessingJob.dataCollectionId.in_(dcids))
+            .filter(AutoProcProgram.processingPrograms.in_(["XChemCollate"]))
+            .filter(AutoProcProgram.recordTimeStamp > min_start_time)
             .filter(
                 or_(
                     AutoProcProgram.processingStatus == None,  # noqa E711
@@ -869,18 +956,19 @@ def trigger_pandda_xchem_post(
 
         if triggered_processing_job := query.first():
             self.log.info(
-                f"Aborting PanDDA2_postrun trigger as another postrun job has started for dcid {triggered_processing_job.dataCollectionId}"
+                f"Exiting XChemCollate trigger: another XChemCollate job has been launched for {acronym}"
             )
             return {"success": True}
 
-        self.log.debug("PanDDA2 postrun trigger: Starting")
+        self.log.debug("XChemCollate trigger: Starting")
 
         recipe_parameters = {
-            "dcid": dcid,  #
+            "dcid": max(dcids),
             "processing_directory": str(processing_directory),
             "scaling_id": scaling_id,
+            "pipedream": pipedream,
         }
-
-        self.upsert_proc(rw, dcid, "PanDDA2-post", recipe_parameters)
+        # Upsert on max dcid
+        self.upsert_proc(rw, max(dcids), "XChem-Collate", recipe_parameters)
 
         return {"success": True}