feat(service): add GetJob fallback to SlurmDBD history and expand query params
GetJob now falls back to SlurmDBD history when the active queue returns 404 or an empty job list. JobHistoryQuery is expanded from 7 to 16 filter params (adds SubmitTime, Cluster, Qos, Constraints, ExitCode, Node, Reservation, Groups, Wckey). Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent). Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
@@ -68,13 +68,22 @@ type JobListResponse struct {
|
|||||||
|
|
||||||
// JobHistoryQuery contains query parameters for job history.
|
// JobHistoryQuery contains query parameters for job history.
|
||||||
type JobHistoryQuery struct {
|
type JobHistoryQuery struct {
|
||||||
Users string `form:"users" json:"users,omitempty"`
|
Users string `form:"users" json:"users,omitempty"`
|
||||||
StartTime string `form:"start_time" json:"start_time,omitempty"`
|
StartTime string `form:"start_time" json:"start_time,omitempty"`
|
||||||
EndTime string `form:"end_time" json:"end_time,omitempty"`
|
EndTime string `form:"end_time" json:"end_time,omitempty"`
|
||||||
Account string `form:"account" json:"account,omitempty"`
|
SubmitTime string `form:"submit_time" json:"submit_time,omitempty"`
|
||||||
Partition string `form:"partition" json:"partition,omitempty"`
|
Account string `form:"account" json:"account,omitempty"`
|
||||||
State string `form:"state" json:"state,omitempty"`
|
Partition string `form:"partition" json:"partition,omitempty"`
|
||||||
JobName string `form:"job_name" json:"job_name,omitempty"`
|
State string `form:"state" json:"state,omitempty"`
|
||||||
Page int `form:"page,default=1" json:"page,omitempty"`
|
JobName string `form:"job_name" json:"job_name,omitempty"`
|
||||||
PageSize int `form:"page_size,default=20" json:"page_size,omitempty"`
|
Cluster string `form:"cluster" json:"cluster,omitempty"`
|
||||||
|
Qos string `form:"qos" json:"qos,omitempty"`
|
||||||
|
Constraints string `form:"constraints" json:"constraints,omitempty"`
|
||||||
|
ExitCode string `form:"exit_code" json:"exit_code,omitempty"`
|
||||||
|
Node string `form:"node" json:"node,omitempty"`
|
||||||
|
Reservation string `form:"reservation" json:"reservation,omitempty"`
|
||||||
|
Groups string `form:"groups" json:"groups,omitempty"`
|
||||||
|
Wckey string `form:"wckey" json:"wckey,omitempty"`
|
||||||
|
Page int `form:"page,default=1" json:"page,omitempty"`
|
||||||
|
PageSize int `form:"page_size,default=20" json:"page_size,omitempty"`
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -116,7 +116,8 @@ func (s *JobService) GetJobs(ctx context.Context) ([]model.JobResponse, error) {
|
|||||||
return jobs, nil
|
return jobs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetJob retrieves a single job by ID.
|
// GetJob retrieves a single job by ID. If the job is not found in the active
|
||||||
|
// queue (404 or empty result), it falls back to querying SlurmDBD history.
|
||||||
func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobResponse, error) {
|
func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobResponse, error) {
|
||||||
s.logger.Debug("slurm API request",
|
s.logger.Debug("slurm API request",
|
||||||
zap.String("operation", "GetJob"),
|
zap.String("operation", "GetJob"),
|
||||||
@@ -128,6 +129,12 @@ func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobRespon
|
|||||||
took := time.Since(start)
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
if slurm.IsNotFound(err) {
|
||||||
|
s.logger.Debug("job not in active queue, querying history",
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
)
|
||||||
|
return s.getJobFromHistory(ctx, jobID)
|
||||||
|
}
|
||||||
s.logger.Debug("slurm API error response",
|
s.logger.Debug("slurm API error response",
|
||||||
zap.String("operation", "GetJob"),
|
zap.String("operation", "GetJob"),
|
||||||
zap.String("job_id", jobID),
|
zap.String("job_id", jobID),
|
||||||
@@ -146,13 +153,49 @@ func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobRespon
|
|||||||
)
|
)
|
||||||
|
|
||||||
if len(result.Jobs) == 0 {
|
if len(result.Jobs) == 0 {
|
||||||
return nil, nil
|
s.logger.Debug("empty jobs response, querying history",
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
)
|
||||||
|
return s.getJobFromHistory(ctx, jobID)
|
||||||
}
|
}
|
||||||
|
|
||||||
resp := mapJobInfo(&result.Jobs[0])
|
resp := mapJobInfo(&result.Jobs[0])
|
||||||
return &resp, nil
|
return &resp, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getJobFromHistory looks up a single job through the SlurmDBD client
// (s.client.SlurmdbJobs.GetJob) and maps the first returned entry with
// mapSlurmdbJob.
//
// It returns (nil, nil) when the lookup reports not-found (slurm.IsNotFound)
// or when the response contains no jobs. Any other error is wrapped as
// "get job history <jobID>: ..." and returned. Both the error and the success
// path are logged at debug level together with the duration of the call.
func (s *JobService) getJobFromHistory(ctx context.Context, jobID string) (*model.JobResponse, error) {
|
||||||
|
start := time.Now()
|
||||||
|
result, _, err := s.client.SlurmdbJobs.GetJob(ctx, jobID)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
s.logger.Debug("slurmdb API error response",
|
||||||
|
zap.String("operation", "getJobFromHistory"),
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
|
// A not-found answer is reported to the caller as "no job" rather than as an error.
if slurm.IsNotFound(err) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("get job history %s: %w", jobID, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurmdb API response",
|
||||||
|
zap.String("operation", "getJobFromHistory"),
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Any("body", result),
|
||||||
|
)
|
||||||
|
|
||||||
|
// A successful response with an empty job list is treated the same as not-found.
if len(result.Jobs) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only the first entry of the response is mapped and returned.
resp := mapSlurmdbJob(&result.Jobs[0])
|
||||||
|
return &resp, nil
|
||||||
|
}
|
||||||
|
|
||||||
// CancelJob cancels a job by ID.
|
// CancelJob cancels a job by ID.
|
||||||
func (s *JobService) CancelJob(ctx context.Context, jobID string) error {
|
func (s *JobService) CancelJob(ctx context.Context, jobID string) error {
|
||||||
s.logger.Debug("slurm API request",
|
s.logger.Debug("slurm API request",
|
||||||
@@ -209,6 +252,33 @@ func (s *JobService) GetJobHistory(ctx context.Context, query *model.JobHistoryQ
|
|||||||
if query.EndTime != "" {
|
if query.EndTime != "" {
|
||||||
opts.EndTime = strToPtr(query.EndTime)
|
opts.EndTime = strToPtr(query.EndTime)
|
||||||
}
|
}
|
||||||
|
if query.SubmitTime != "" {
|
||||||
|
opts.SubmitTime = strToPtr(query.SubmitTime)
|
||||||
|
}
|
||||||
|
if query.Cluster != "" {
|
||||||
|
opts.Cluster = strToPtr(query.Cluster)
|
||||||
|
}
|
||||||
|
if query.Qos != "" {
|
||||||
|
opts.Qos = strToPtr(query.Qos)
|
||||||
|
}
|
||||||
|
if query.Constraints != "" {
|
||||||
|
opts.Constraints = strToPtr(query.Constraints)
|
||||||
|
}
|
||||||
|
if query.ExitCode != "" {
|
||||||
|
opts.ExitCode = strToPtr(query.ExitCode)
|
||||||
|
}
|
||||||
|
if query.Node != "" {
|
||||||
|
opts.Node = strToPtr(query.Node)
|
||||||
|
}
|
||||||
|
if query.Reservation != "" {
|
||||||
|
opts.Reservation = strToPtr(query.Reservation)
|
||||||
|
}
|
||||||
|
if query.Groups != "" {
|
||||||
|
opts.Groups = strToPtr(query.Groups)
|
||||||
|
}
|
||||||
|
if query.Wckey != "" {
|
||||||
|
opts.Wckey = strToPtr(query.Wckey)
|
||||||
|
}
|
||||||
|
|
||||||
s.logger.Debug("slurm API request",
|
s.logger.Debug("slurm API request",
|
||||||
zap.String("operation", "GetJobHistory"),
|
zap.String("operation", "GetJobHistory"),
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"gcy_hpc_server/internal/model"
|
"gcy_hpc_server/internal/model"
|
||||||
@@ -701,3 +702,157 @@ func TestJobService_GetJobHistory_ErrorLog(t *testing.T) {
|
|||||||
t.Error("expected error field in log entry")
|
t.Error("expected error field in log entry")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Fallback to SlurmDBD history tests
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// TestGetJob_FallbackToHistory_Found checks that GetJob returns the job served
// under /slurmdb/v0.0.40/job/198 when the handler answers
// /slurm/v0.0.40/job/198 with a 404 and an empty "jobs" list.
func TestGetJob_FallbackToHistory_Found(t *testing.T) {
|
||||||
|
jobID := int32(198)
|
||||||
|
name := "hist-job"
|
||||||
|
ts := int64(1700000000)
|
||||||
|
|
||||||
|
client, cleanup := mockJobServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
|
||||||
|
switch r.URL.Path {
|
||||||
|
// Active-queue endpoint: 404 with an error payload and no jobs.
case "/slurm/v0.0.40/job/198":
|
||||||
|
w.WriteHeader(http.StatusNotFound)
|
||||||
|
json.NewEncoder(w).Encode(map[string]interface{}{
|
||||||
|
"errors": []map[string]interface{}{
|
||||||
|
{
|
||||||
|
"description": "Unable to query JobId=198",
|
||||||
|
"error_number": float64(2017),
|
||||||
|
"error": "Invalid job id specified",
|
||||||
|
"source": "_handle_job_get",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"jobs": []interface{}{},
|
||||||
|
})
|
||||||
|
// History endpoint: one COMPLETED job with ID 198.
case "/slurmdb/v0.0.40/job/198":
|
||||||
|
resp := slurm.OpenapiSlurmdbdJobsResp{
|
||||||
|
Jobs: slurm.JobList{
|
||||||
|
{
|
||||||
|
JobID: &jobID,
|
||||||
|
Name: &name,
|
||||||
|
State: &slurm.JobState{Current: []string{"COMPLETED"}},
|
||||||
|
Time: &slurm.JobTime{Submission: &ts, Start: &ts, End: &ts},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
json.NewEncoder(w).Encode(resp)
|
||||||
|
default:
|
||||||
|
w.WriteHeader(http.StatusNotFound)
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
svc := NewJobService(client, zap.NewNop())
|
||||||
|
job, err := svc.GetJob(context.Background(), "198")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetJob: %v", err)
|
||||||
|
}
|
||||||
|
if job == nil {
|
||||||
|
t.Fatal("expected job, got nil")
|
||||||
|
}
|
||||||
|
if job.JobID != 198 {
|
||||||
|
t.Errorf("expected JobID 198, got %d", job.JobID)
|
||||||
|
}
|
||||||
|
if job.Name != "hist-job" {
|
||||||
|
t.Errorf("expected Name hist-job, got %s", job.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGetJob_FallbackToHistory_NotFound checks that GetJob returns a nil job
// and a nil error when the handler answers every request with a 404.
func TestGetJob_FallbackToHistory_NotFound(t *testing.T) {
|
||||||
|
client, cleanup := mockJobServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
w.WriteHeader(http.StatusNotFound)
|
||||||
|
}))
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
svc := NewJobService(client, zap.NewNop())
|
||||||
|
job, err := svc.GetJob(context.Background(), "999")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetJob: %v", err)
|
||||||
|
}
|
||||||
|
if job != nil {
|
||||||
|
t.Errorf("expected nil, got %+v", job)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGetJob_FallbackToHistory_HistoryError checks that GetJob returns a nil
// job and an error containing "get job history" when the handler answers
// /slurm/v0.0.40/job/500 with a 404 and /slurmdb/v0.0.40/job/500 with a 500.
func TestGetJob_FallbackToHistory_HistoryError(t *testing.T) {
|
||||||
|
client, cleanup := mockJobServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
|
||||||
|
switch r.URL.Path {
|
||||||
|
// Active-queue endpoint: 404 with an error payload and no jobs.
case "/slurm/v0.0.40/job/500":
|
||||||
|
w.WriteHeader(http.StatusNotFound)
|
||||||
|
json.NewEncoder(w).Encode(map[string]interface{}{
|
||||||
|
"errors": []map[string]interface{}{
|
||||||
|
{
|
||||||
|
"description": "Unable to query JobId=500",
|
||||||
|
"error_number": float64(2017),
|
||||||
|
"error": "Invalid job id specified",
|
||||||
|
"source": "_handle_job_get",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"jobs": []interface{}{},
|
||||||
|
})
|
||||||
|
// History endpoint: internal server error.
case "/slurmdb/v0.0.40/job/500":
|
||||||
|
w.WriteHeader(http.StatusInternalServerError)
|
||||||
|
w.Write([]byte(`{"errors":[{"error":"db error"}]}`))
|
||||||
|
default:
|
||||||
|
w.WriteHeader(http.StatusNotFound)
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
svc := NewJobService(client, zap.NewNop())
|
||||||
|
job, err := svc.GetJob(context.Background(), "500")
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error, got nil")
|
||||||
|
}
|
||||||
|
if job != nil {
|
||||||
|
t.Errorf("expected nil job, got %+v", job)
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "get job history") {
|
||||||
|
t.Errorf("expected error to contain 'get job history', got %s", err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGetJob_FallbackToHistory_EmptyHistory checks that GetJob returns a nil
// job and a nil error when the handler answers /slurm/v0.0.40/job/777 with a
// 404 and /slurmdb/v0.0.40/job/777 with an empty job list.
func TestGetJob_FallbackToHistory_EmptyHistory(t *testing.T) {
|
||||||
|
client, cleanup := mockJobServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
|
||||||
|
switch r.URL.Path {
|
||||||
|
// Active-queue endpoint: 404 with an error payload and no jobs.
case "/slurm/v0.0.40/job/777":
|
||||||
|
w.WriteHeader(http.StatusNotFound)
|
||||||
|
json.NewEncoder(w).Encode(map[string]interface{}{
|
||||||
|
"errors": []map[string]interface{}{
|
||||||
|
{
|
||||||
|
"description": "Unable to query JobId=777",
|
||||||
|
"error_number": float64(2017),
|
||||||
|
"error": "Invalid job id specified",
|
||||||
|
"source": "_handle_job_get",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"jobs": []interface{}{},
|
||||||
|
})
|
||||||
|
// History endpoint: default (implicit 200) status with an empty job list.
case "/slurmdb/v0.0.40/job/777":
|
||||||
|
resp := slurm.OpenapiSlurmdbdJobsResp{Jobs: slurm.JobList{}}
|
||||||
|
json.NewEncoder(w).Encode(resp)
|
||||||
|
default:
|
||||||
|
w.WriteHeader(http.StatusNotFound)
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
svc := NewJobService(client, zap.NewNop())
|
||||||
|
job, err := svc.GetJob(context.Background(), "777")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetJob: %v", err)
|
||||||
|
}
|
||||||
|
if job != nil {
|
||||||
|
t.Errorf("expected nil, got %+v", job)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user