fix(task): prevent duplicate Slurm job submission on backend restart

RecoverStuckTasks now skips tasks that already have a slurm_job_id,
and ProcessTask adds a guard before the submitting step to prevent
re-submission even if a task is incorrectly re-enqueued.

Also deprecates the POST /api/v1/jobs/submit endpoint (replaced by POST /tasks)
and comments out the related handlers and tests.
This commit is contained in:
dailz
2026-04-21 10:57:38 +08:00
parent 4fd331ebd8
commit b90942de77
8 changed files with 61 additions and 35 deletions

View File

@@ -263,7 +263,15 @@ func (s *TaskService) ProcessTask(ctx context.Context, taskID int64) error {
}
}
// 13-14. Set ready + submitting
// 13-14. Set ready + submitting (guard: skip if already submitted to Slurm)
if task.SlurmJobID != nil {
s.logger.Info("task already has slurm job, skipping submission",
zap.Int64("task_id", taskID),
zap.Int32("slurm_job_id", *task.SlurmJobID),
)
return nil
}
if err := s.taskStore.UpdateRetryState(ctx, taskID, model.TaskStatusReady, model.TaskStepSubmitting, 0); err != nil {
return fail(model.TaskStepSubmitting, fmt.Sprintf("update status to ready: %v", err))
}
@@ -694,6 +702,13 @@ func (s *TaskService) RecoverStuckTasks(ctx context.Context) {
return
}
for i := range tasks {
if tasks[i].SlurmJobID != nil {
s.logger.Info("skipping stuck task recovery, already in slurm",
zap.Int64("taskID", tasks[i].ID),
zap.Int32("slurm_job_id", *tasks[i].SlurmJobID),
)
continue
}
_ = s.taskStore.UpdateStatus(ctx, tasks[i].ID, model.TaskStatusSubmitted, "")
s.mu.Lock()
if !s.stopped {