fix(task): return empty string for unknown/empty Slurm states instead of defaulting to running
mapSlurmStateToTaskStatus previously defaulted to 'running' for empty state slices and unrecognized states. This was too aggressive — treating an unknown state as actively running could cause incorrect status updates when Slurm returns unexpected or empty state data. Now empty and unrecognized states return an empty string (unrecognized states are also logged as a warning), and refreshTaskStatus skips the update in that case.
This commit is contained in:
@@ -497,7 +497,7 @@ func uniqueInt64s(ids []int64) []int64 {
|
||||
|
||||
func (s *TaskService) mapSlurmStateToTaskStatus(slurmState []string) string {
|
||||
if len(slurmState) == 0 {
|
||||
return model.TaskStatusRunning
|
||||
return ""
|
||||
}
|
||||
|
||||
state := strings.ToUpper(slurmState[0])
|
||||
@@ -511,7 +511,8 @@ func (s *TaskService) mapSlurmStateToTaskStatus(slurmState []string) string {
|
||||
case "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL", "OUT_OF_MEMORY", "PREEMPTED":
|
||||
return model.TaskStatusFailed
|
||||
default:
|
||||
return model.TaskStatusRunning
|
||||
s.logger.Warn("unrecognized slurm state, skipping update", zap.String("state", state))
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
@@ -542,15 +543,16 @@ func (s *TaskService) refreshTaskStatus(ctx context.Context, taskID int64) error
|
||||
}
|
||||
|
||||
newStatus := s.mapSlurmStateToTaskStatus(jobResp.State)
|
||||
if newStatus != task.Status {
|
||||
s.logger.Info("updating task status from slurm",
|
||||
zap.Int64("task_id", taskID),
|
||||
zap.String("old_status", task.Status),
|
||||
zap.String("new_status", newStatus),
|
||||
)
|
||||
return s.taskStore.UpdateStatus(ctx, taskID, newStatus, "")
|
||||
if newStatus == "" || newStatus == task.Status {
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
|
||||
s.logger.Info("updating task status from slurm",
|
||||
zap.Int64("task_id", taskID),
|
||||
zap.String("old_status", task.Status),
|
||||
zap.String("new_status", newStatus),
|
||||
)
|
||||
return s.taskStore.UpdateStatus(ctx, taskID, newStatus, "")
|
||||
}
|
||||
|
||||
func (s *TaskService) RefreshStaleTasks(ctx context.Context) error {
|
||||
|
||||
@@ -97,7 +97,7 @@ func TestTaskService_MapSlurmState_AllStates(t *testing.T) {
|
||||
{[]string{"OUT_OF_MEMORY"}, model.TaskStatusFailed},
|
||||
{[]string{"PREEMPTED"}, model.TaskStatusFailed},
|
||||
{[]string{"SPECIAL_EXIT"}, model.TaskStatusRunning},
|
||||
{[]string{"unknown_state"}, model.TaskStatusRunning},
|
||||
{[]string{"unknown_state"}, ""},
|
||||
{[]string{"pending"}, model.TaskStatusQueued},
|
||||
{[]string{"Running"}, model.TaskStatusRunning},
|
||||
}
|
||||
@@ -115,13 +115,13 @@ func TestTaskService_MapSlurmState_Empty(t *testing.T) {
|
||||
defer env.close()
|
||||
|
||||
got := env.svc.mapSlurmStateToTaskStatus([]string{})
|
||||
if got != model.TaskStatusRunning {
|
||||
t.Errorf("mapSlurmStateToTaskStatus([]) = %q, want %q", got, model.TaskStatusRunning)
|
||||
if got != "" {
|
||||
t.Errorf("mapSlurmStateToTaskStatus([]) = %q, want empty string", got)
|
||||
}
|
||||
|
||||
got = env.svc.mapSlurmStateToTaskStatus(nil)
|
||||
if got != model.TaskStatusRunning {
|
||||
t.Errorf("mapSlurmStateToTaskStatus(nil) = %q, want %q", got, model.TaskStatusRunning)
|
||||
if got != "" {
|
||||
t.Errorf("mapSlurmStateToTaskStatus(nil) = %q, want empty string", got)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user