feat(service): add GetJob fallback to SlurmDBD history and expand query params
GetJob now falls back to SlurmDBD history when the active queue returns a 404 or an empty job list. Expand JobHistoryQuery from 7 to 16 filter params (adds SubmitTime, Cluster, Qos, Constraints, ExitCode, Node, Reservation, Groups, Wckey). Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
@@ -116,7 +116,8 @@ func (s *JobService) GetJobs(ctx context.Context) ([]model.JobResponse, error) {
|
||||
return jobs, nil
|
||||
}
|
||||
|
||||
// GetJob retrieves a single job by ID.
|
||||
// GetJob retrieves a single job by ID. If the job is not found in the active
|
||||
// queue (404 or empty result), it falls back to querying SlurmDBD history.
|
||||
func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobResponse, error) {
|
||||
s.logger.Debug("slurm API request",
|
||||
zap.String("operation", "GetJob"),
|
||||
@@ -128,6 +129,12 @@ func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobRespon
|
||||
took := time.Since(start)
|
||||
|
||||
if err != nil {
|
||||
if slurm.IsNotFound(err) {
|
||||
s.logger.Debug("job not in active queue, querying history",
|
||||
zap.String("job_id", jobID),
|
||||
)
|
||||
return s.getJobFromHistory(ctx, jobID)
|
||||
}
|
||||
s.logger.Debug("slurm API error response",
|
||||
zap.String("operation", "GetJob"),
|
||||
zap.String("job_id", jobID),
|
||||
@@ -146,13 +153,49 @@ func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobRespon
|
||||
)
|
||||
|
||||
if len(result.Jobs) == 0 {
|
||||
return nil, nil
|
||||
s.logger.Debug("empty jobs response, querying history",
|
||||
zap.String("job_id", jobID),
|
||||
)
|
||||
return s.getJobFromHistory(ctx, jobID)
|
||||
}
|
||||
|
||||
resp := mapJobInfo(&result.Jobs[0])
|
||||
return &resp, nil
|
||||
}
|
||||
|
||||
func (s *JobService) getJobFromHistory(ctx context.Context, jobID string) (*model.JobResponse, error) {
|
||||
start := time.Now()
|
||||
result, _, err := s.client.SlurmdbJobs.GetJob(ctx, jobID)
|
||||
took := time.Since(start)
|
||||
|
||||
if err != nil {
|
||||
s.logger.Debug("slurmdb API error response",
|
||||
zap.String("operation", "getJobFromHistory"),
|
||||
zap.String("job_id", jobID),
|
||||
zap.Duration("took", took),
|
||||
zap.Error(err),
|
||||
)
|
||||
if slurm.IsNotFound(err) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("get job history %s: %w", jobID, err)
|
||||
}
|
||||
|
||||
s.logger.Debug("slurmdb API response",
|
||||
zap.String("operation", "getJobFromHistory"),
|
||||
zap.String("job_id", jobID),
|
||||
zap.Duration("took", took),
|
||||
zap.Any("body", result),
|
||||
)
|
||||
|
||||
if len(result.Jobs) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
resp := mapSlurmdbJob(&result.Jobs[0])
|
||||
return &resp, nil
|
||||
}
|
||||
|
||||
// CancelJob cancels a job by ID.
|
||||
func (s *JobService) CancelJob(ctx context.Context, jobID string) error {
|
||||
s.logger.Debug("slurm API request",
|
||||
@@ -209,6 +252,33 @@ func (s *JobService) GetJobHistory(ctx context.Context, query *model.JobHistoryQ
|
||||
if query.EndTime != "" {
|
||||
opts.EndTime = strToPtr(query.EndTime)
|
||||
}
|
||||
if query.SubmitTime != "" {
|
||||
opts.SubmitTime = strToPtr(query.SubmitTime)
|
||||
}
|
||||
if query.Cluster != "" {
|
||||
opts.Cluster = strToPtr(query.Cluster)
|
||||
}
|
||||
if query.Qos != "" {
|
||||
opts.Qos = strToPtr(query.Qos)
|
||||
}
|
||||
if query.Constraints != "" {
|
||||
opts.Constraints = strToPtr(query.Constraints)
|
||||
}
|
||||
if query.ExitCode != "" {
|
||||
opts.ExitCode = strToPtr(query.ExitCode)
|
||||
}
|
||||
if query.Node != "" {
|
||||
opts.Node = strToPtr(query.Node)
|
||||
}
|
||||
if query.Reservation != "" {
|
||||
opts.Reservation = strToPtr(query.Reservation)
|
||||
}
|
||||
if query.Groups != "" {
|
||||
opts.Groups = strToPtr(query.Groups)
|
||||
}
|
||||
if query.Wckey != "" {
|
||||
opts.Wckey = strToPtr(query.Wckey)
|
||||
}
|
||||
|
||||
s.logger.Debug("slurm API request",
|
||||
zap.String("operation", "GetJobHistory"),
|
||||
|
||||
Reference in New Issue
Block a user