fix(task): return empty string for unknown/empty Slurm states instead of defaulting to running
mapSlurmStateToTaskStatus previously defaulted to 'running' for empty state arrays and unrecognized states. This was too aggressive: treating an unknown state as actively running could cause incorrect status updates when Slurm returns unexpected or empty state data. Now empty or unrecognized states return an empty string (unrecognized states also log a warning), and refreshTaskStatus skips the update in that case.
This commit is contained in:
@@ -497,7 +497,7 @@ func uniqueInt64s(ids []int64) []int64 {
|
|||||||
|
|
||||||
func (s *TaskService) mapSlurmStateToTaskStatus(slurmState []string) string {
|
func (s *TaskService) mapSlurmStateToTaskStatus(slurmState []string) string {
|
||||||
if len(slurmState) == 0 {
|
if len(slurmState) == 0 {
|
||||||
return model.TaskStatusRunning
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
state := strings.ToUpper(slurmState[0])
|
state := strings.ToUpper(slurmState[0])
|
||||||
@@ -511,7 +511,8 @@ func (s *TaskService) mapSlurmStateToTaskStatus(slurmState []string) string {
|
|||||||
case "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL", "OUT_OF_MEMORY", "PREEMPTED":
|
case "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL", "OUT_OF_MEMORY", "PREEMPTED":
|
||||||
return model.TaskStatusFailed
|
return model.TaskStatusFailed
|
||||||
default:
|
default:
|
||||||
return model.TaskStatusRunning
|
s.logger.Warn("unrecognized slurm state, skipping update", zap.String("state", state))
|
||||||
|
return ""
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -542,15 +543,16 @@ func (s *TaskService) refreshTaskStatus(ctx context.Context, taskID int64) error
|
|||||||
}
|
}
|
||||||
|
|
||||||
newStatus := s.mapSlurmStateToTaskStatus(jobResp.State)
|
newStatus := s.mapSlurmStateToTaskStatus(jobResp.State)
|
||||||
if newStatus != task.Status {
|
if newStatus == "" || newStatus == task.Status {
|
||||||
s.logger.Info("updating task status from slurm",
|
return nil
|
||||||
zap.Int64("task_id", taskID),
|
|
||||||
zap.String("old_status", task.Status),
|
|
||||||
zap.String("new_status", newStatus),
|
|
||||||
)
|
|
||||||
return s.taskStore.UpdateStatus(ctx, taskID, newStatus, "")
|
|
||||||
}
|
}
|
||||||
return nil
|
|
||||||
|
s.logger.Info("updating task status from slurm",
|
||||||
|
zap.Int64("task_id", taskID),
|
||||||
|
zap.String("old_status", task.Status),
|
||||||
|
zap.String("new_status", newStatus),
|
||||||
|
)
|
||||||
|
return s.taskStore.UpdateStatus(ctx, taskID, newStatus, "")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *TaskService) RefreshStaleTasks(ctx context.Context) error {
|
func (s *TaskService) RefreshStaleTasks(ctx context.Context) error {
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ func TestTaskService_MapSlurmState_AllStates(t *testing.T) {
|
|||||||
{[]string{"OUT_OF_MEMORY"}, model.TaskStatusFailed},
|
{[]string{"OUT_OF_MEMORY"}, model.TaskStatusFailed},
|
||||||
{[]string{"PREEMPTED"}, model.TaskStatusFailed},
|
{[]string{"PREEMPTED"}, model.TaskStatusFailed},
|
||||||
{[]string{"SPECIAL_EXIT"}, model.TaskStatusRunning},
|
{[]string{"SPECIAL_EXIT"}, model.TaskStatusRunning},
|
||||||
{[]string{"unknown_state"}, model.TaskStatusRunning},
|
{[]string{"unknown_state"}, ""},
|
||||||
{[]string{"pending"}, model.TaskStatusQueued},
|
{[]string{"pending"}, model.TaskStatusQueued},
|
||||||
{[]string{"Running"}, model.TaskStatusRunning},
|
{[]string{"Running"}, model.TaskStatusRunning},
|
||||||
}
|
}
|
||||||
@@ -115,13 +115,13 @@ func TestTaskService_MapSlurmState_Empty(t *testing.T) {
|
|||||||
defer env.close()
|
defer env.close()
|
||||||
|
|
||||||
got := env.svc.mapSlurmStateToTaskStatus([]string{})
|
got := env.svc.mapSlurmStateToTaskStatus([]string{})
|
||||||
if got != model.TaskStatusRunning {
|
if got != "" {
|
||||||
t.Errorf("mapSlurmStateToTaskStatus([]) = %q, want %q", got, model.TaskStatusRunning)
|
t.Errorf("mapSlurmStateToTaskStatus([]) = %q, want empty string", got)
|
||||||
}
|
}
|
||||||
|
|
||||||
got = env.svc.mapSlurmStateToTaskStatus(nil)
|
got = env.svc.mapSlurmStateToTaskStatus(nil)
|
||||||
if got != model.TaskStatusRunning {
|
if got != "" {
|
||||||
t.Errorf("mapSlurmStateToTaskStatus(nil) = %q, want %q", got, model.TaskStatusRunning)
|
t.Errorf("mapSlurmStateToTaskStatus(nil) = %q, want empty string", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user