fix(task): prevent duplicate Slurm job submission on backend restart
RecoverStuckTasks now skips tasks that already have a slurm_job_id, and ProcessTask adds a guard before the submitting step to prevent re-submission even if a task is incorrectly re-enqueued. Also deprecates the POST /api/v1/jobs/submit endpoint (replaced by POST /tasks) and comments out the related handlers and tests.
This commit is contained in:
@@ -263,7 +263,15 @@ func (s *TaskService) ProcessTask(ctx context.Context, taskID int64) error {
|
||||
}
|
||||
}
|
||||
|
||||
// 13-14. Set ready + submitting
|
||||
// 13-14. Set ready + submitting (guard: skip if already submitted to Slurm)
|
||||
if task.SlurmJobID != nil {
|
||||
s.logger.Info("task already has slurm job, skipping submission",
|
||||
zap.Int64("task_id", taskID),
|
||||
zap.Int32("slurm_job_id", *task.SlurmJobID),
|
||||
)
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := s.taskStore.UpdateRetryState(ctx, taskID, model.TaskStatusReady, model.TaskStepSubmitting, 0); err != nil {
|
||||
return fail(model.TaskStepSubmitting, fmt.Sprintf("update status to ready: %v", err))
|
||||
}
|
||||
@@ -694,6 +702,13 @@ func (s *TaskService) RecoverStuckTasks(ctx context.Context) {
|
||||
return
|
||||
}
|
||||
for i := range tasks {
|
||||
if tasks[i].SlurmJobID != nil {
|
||||
s.logger.Info("skipping stuck task recovery, already in slurm",
|
||||
zap.Int64("taskID", tasks[i].ID),
|
||||
zap.Int32("slurm_job_id", *tasks[i].SlurmJobID),
|
||||
)
|
||||
continue
|
||||
}
|
||||
_ = s.taskStore.UpdateStatus(ctx, tasks[i].ID, model.TaskStatusSubmitted, "")
|
||||
s.mu.Lock()
|
||||
if !s.stopped {
|
||||
|
||||
Reference in New Issue
Block a user