diff --git a/internal/model/job.go b/internal/model/job.go
index a3383cd..0c14342 100644
--- a/internal/model/job.go
+++ b/internal/model/job.go
@@ -68,13 +68,22 @@ type JobListResponse struct {
 
 // JobHistoryQuery contains query parameters for job history.
 type JobHistoryQuery struct {
-	Users     string `form:"users" json:"users,omitempty"`
-	StartTime string `form:"start_time" json:"start_time,omitempty"`
-	EndTime   string `form:"end_time" json:"end_time,omitempty"`
-	Account   string `form:"account" json:"account,omitempty"`
-	Partition string `form:"partition" json:"partition,omitempty"`
-	State     string `form:"state" json:"state,omitempty"`
-	JobName   string `form:"job_name" json:"job_name,omitempty"`
-	Page      int    `form:"page,default=1" json:"page,omitempty"`
-	PageSize  int    `form:"page_size,default=20" json:"page_size,omitempty"`
+	Users       string `form:"users" json:"users,omitempty"`
+	StartTime   string `form:"start_time" json:"start_time,omitempty"`
+	EndTime     string `form:"end_time" json:"end_time,omitempty"`
+	SubmitTime  string `form:"submit_time" json:"submit_time,omitempty"`
+	Account     string `form:"account" json:"account,omitempty"`
+	Partition   string `form:"partition" json:"partition,omitempty"`
+	State       string `form:"state" json:"state,omitempty"`
+	JobName     string `form:"job_name" json:"job_name,omitempty"`
+	Cluster     string `form:"cluster" json:"cluster,omitempty"`
+	Qos         string `form:"qos" json:"qos,omitempty"`
+	Constraints string `form:"constraints" json:"constraints,omitempty"`
+	ExitCode    string `form:"exit_code" json:"exit_code,omitempty"`
+	Node        string `form:"node" json:"node,omitempty"`
+	Reservation string `form:"reservation" json:"reservation,omitempty"`
+	Groups      string `form:"groups" json:"groups,omitempty"`
+	Wckey       string `form:"wckey" json:"wckey,omitempty"`
+	Page        int    `form:"page,default=1" json:"page,omitempty"`
+	PageSize    int    `form:"page_size,default=20" json:"page_size,omitempty"`
 }
diff --git a/internal/service/job_service.go b/internal/service/job_service.go
index 63e9a4e..0454058 100644
--- a/internal/service/job_service.go
+++ b/internal/service/job_service.go
@@ -116,7 +116,8 @@ func (s *JobService) GetJobs(ctx context.Context) ([]model.JobResponse, error) {
 	return jobs, nil
 }
 
-// GetJob retrieves a single job by ID.
+// GetJob retrieves a single job by ID. If the job is not found in the active
+// queue (404 or empty result), it falls back to querying SlurmDBD history.
 func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobResponse, error) {
 	s.logger.Debug("slurm API request",
 		zap.String("operation", "GetJob"),
@@ -128,6 +129,13 @@ func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobRespon
 	took := time.Since(start)
 
 	if err != nil {
+		if slurm.IsNotFound(err) {
+			// Not-found only means the job is absent from the active queue; history may still have it.
+			s.logger.Debug("job not in active queue, querying history",
+				zap.String("job_id", jobID),
+			)
+			return s.getJobFromHistory(ctx, jobID)
+		}
 		s.logger.Debug("slurm API error response",
 			zap.String("operation", "GetJob"),
 			zap.String("job_id", jobID),
@@ -146,13 +154,55 @@ func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobRespon
 	)
 
 	if len(result.Jobs) == 0 {
-		return nil, nil
+		s.logger.Debug("empty jobs response, querying history",
+			zap.String("job_id", jobID),
+		)
+		return s.getJobFromHistory(ctx, jobID)
 	}
 
 	resp := mapJobInfo(&result.Jobs[0])
 	return &resp, nil
 }
 
+// getJobFromHistory looks up jobID through the SlurmDBD jobs endpoint and maps
+// the first returned record. GetJob uses it as the fallback when the job is
+// not in the active queue. It returns (nil, nil) when SlurmDBD reports
+// not-found or returns no records, and a wrapped error for any other failure.
+func (s *JobService) getJobFromHistory(ctx context.Context, jobID string) (*model.JobResponse, error) {
+	start := time.Now()
+	result, _, err := s.client.SlurmdbJobs.GetJob(ctx, jobID)
+	took := time.Since(start)
+
+	if err != nil {
+		s.logger.Debug("slurmdb API error response",
+			zap.String("operation", "getJobFromHistory"),
+			zap.String("job_id", jobID),
+			zap.Duration("took", took),
+			zap.Error(err),
+		)
+		if slurm.IsNotFound(err) {
+			// Not in history either: report "no such job" rather than a failure.
+			return nil, nil
+		}
+		return nil, fmt.Errorf("get job history %s: %w", jobID, err)
+	}
+
+	s.logger.Debug("slurmdb API response",
+		zap.String("operation", "getJobFromHistory"),
+		zap.String("job_id", jobID),
+		zap.Duration("took", took),
+		zap.Any("body", result),
+	)
+
+	// A successful but empty answer is treated the same as not-found.
+	if len(result.Jobs) == 0 {
+		return nil, nil
+	}
+
+	resp := mapSlurmdbJob(&result.Jobs[0])
+	return &resp, nil
+}
+
 // CancelJob cancels a job by ID.
 func (s *JobService) CancelJob(ctx context.Context, jobID string) error {
 	s.logger.Debug("slurm API request",
@@ -209,6 +259,33 @@ func (s *JobService) GetJobHistory(ctx context.Context, query *model.JobHistoryQ
 	if query.EndTime != "" {
 		opts.EndTime = strToPtr(query.EndTime)
 	}
+	if query.SubmitTime != "" {
+		opts.SubmitTime = strToPtr(query.SubmitTime)
+	}
+	if query.Cluster != "" {
+		opts.Cluster = strToPtr(query.Cluster)
+	}
+	if query.Qos != "" {
+		opts.Qos = strToPtr(query.Qos)
+	}
+	if query.Constraints != "" {
+		opts.Constraints = strToPtr(query.Constraints)
+	}
+	if query.ExitCode != "" {
+		opts.ExitCode = strToPtr(query.ExitCode)
+	}
+	if query.Node != "" {
+		opts.Node = strToPtr(query.Node)
+	}
+	if query.Reservation != "" {
+		opts.Reservation = strToPtr(query.Reservation)
+	}
+	if query.Groups != "" {
+		opts.Groups = strToPtr(query.Groups)
+	}
+	if query.Wckey != "" {
+		opts.Wckey = strToPtr(query.Wckey)
+	}
 
 	s.logger.Debug("slurm API request",
 		zap.String("operation", "GetJobHistory"),
diff --git a/internal/service/job_service_test.go b/internal/service/job_service_test.go
index b327da6..7efc295 100644
--- a/internal/service/job_service_test.go
+++ b/internal/service/job_service_test.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
 
 	"gcy_hpc_server/internal/model"
@@ -701,3 +702,210 @@ func TestJobService_GetJobHistory_ErrorLog(t *testing.T) {
 		t.Error("expected error field in log entry")
 	}
 }
+
+// ---------------------------------------------------------------------------
+// Fallback to SlurmDBD history tests
+// ---------------------------------------------------------------------------
+
+// TestGetJob_FallbackToHistory_Found: the active endpoint answers 404 and the history endpoint has the job.
+func TestGetJob_FallbackToHistory_Found(t *testing.T) {
+	jobID := int32(198)
+	name := "hist-job"
+	ts := int64(1700000000)
+
+	client, cleanup := mockJobServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+
+		switch r.URL.Path {
+		case "/slurm/v0.0.40/job/198":
+			w.WriteHeader(http.StatusNotFound)
+			json.NewEncoder(w).Encode(map[string]interface{}{
+				"errors": []map[string]interface{}{
+					{
+						"description":  "Unable to query JobId=198",
+						"error_number": float64(2017),
+						"error":        "Invalid job id specified",
+						"source":       "_handle_job_get",
+					},
+				},
+				"jobs": []interface{}{},
+			})
+		case "/slurmdb/v0.0.40/job/198":
+			resp := slurm.OpenapiSlurmdbdJobsResp{
+				Jobs: slurm.JobList{
+					{
+						JobID: &jobID,
+						Name:  &name,
+						State: &slurm.JobState{Current: []string{"COMPLETED"}},
+						Time:  &slurm.JobTime{Submission: &ts, Start: &ts, End: &ts},
+					},
+				},
+			}
+			json.NewEncoder(w).Encode(resp)
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+	defer cleanup()
+
+	svc := NewJobService(client, zap.NewNop())
+	job, err := svc.GetJob(context.Background(), "198")
+	if err != nil {
+		t.Fatalf("GetJob: %v", err)
+	}
+	if job == nil {
+		t.Fatal("expected job, got nil")
+	}
+	if job.JobID != 198 {
+		t.Errorf("expected JobID 198, got %d", job.JobID)
+	}
+	if job.Name != "hist-job" {
+		t.Errorf("expected Name hist-job, got %s", job.Name)
+	}
+}
+
+// TestGetJob_FallbackToHistory_NotFound: every endpoint answers 404, so GetJob returns (nil, nil).
+func TestGetJob_FallbackToHistory_NotFound(t *testing.T) {
+	client, cleanup := mockJobServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusNotFound)
+	}))
+	defer cleanup()
+
+	svc := NewJobService(client, zap.NewNop())
+	job, err := svc.GetJob(context.Background(), "999")
+	if err != nil {
+		t.Fatalf("GetJob: %v", err)
+	}
+	if job != nil {
+		t.Errorf("expected nil, got %+v", job)
+	}
+}
+
+// TestGetJob_FallbackToHistory_HistoryError: the history endpoint answers 500 and the wrapped error is returned.
+func TestGetJob_FallbackToHistory_HistoryError(t *testing.T) {
+	client, cleanup := mockJobServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+
+		switch r.URL.Path {
+		case "/slurm/v0.0.40/job/500":
+			w.WriteHeader(http.StatusNotFound)
+			json.NewEncoder(w).Encode(map[string]interface{}{
+				"errors": []map[string]interface{}{
+					{
+						"description":  "Unable to query JobId=500",
+						"error_number": float64(2017),
+						"error":        "Invalid job id specified",
+						"source":       "_handle_job_get",
+					},
+				},
+				"jobs": []interface{}{},
+			})
+		case "/slurmdb/v0.0.40/job/500":
+			w.WriteHeader(http.StatusInternalServerError)
+			w.Write([]byte(`{"errors":[{"error":"db error"}]}`))
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+	defer cleanup()
+
+	svc := NewJobService(client, zap.NewNop())
+	job, err := svc.GetJob(context.Background(), "500")
+	if err == nil {
+		t.Fatal("expected error, got nil")
+	}
+	if job != nil {
+		t.Errorf("expected nil job, got %+v", job)
+	}
+	if !strings.Contains(err.Error(), "get job history") {
+		t.Errorf("expected error to contain 'get job history', got %s", err.Error())
+	}
+}
+
+// TestGetJob_FallbackToHistory_EmptyHistory: the history endpoint returns an empty list, so GetJob returns (nil, nil).
+func TestGetJob_FallbackToHistory_EmptyHistory(t *testing.T) {
+	client, cleanup := mockJobServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+
+		switch r.URL.Path {
+		case "/slurm/v0.0.40/job/777":
+			w.WriteHeader(http.StatusNotFound)
+			json.NewEncoder(w).Encode(map[string]interface{}{
+				"errors": []map[string]interface{}{
+					{
+						"description":  "Unable to query JobId=777",
+						"error_number": float64(2017),
+						"error":        "Invalid job id specified",
+						"source":       "_handle_job_get",
+					},
+				},
+				"jobs": []interface{}{},
+			})
+		case "/slurmdb/v0.0.40/job/777":
+			resp := slurm.OpenapiSlurmdbdJobsResp{Jobs: slurm.JobList{}}
+			json.NewEncoder(w).Encode(resp)
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+	defer cleanup()
+
+	svc := NewJobService(client, zap.NewNop())
+	job, err := svc.GetJob(context.Background(), "777")
+	if err != nil {
+		t.Fatalf("GetJob: %v", err)
+	}
+	if job != nil {
+		t.Errorf("expected nil, got %+v", job)
+	}
+}
+
+// TestGetJob_FallbackToHistory_EmptyActiveResponse: the active endpoint answers 200 with no jobs and GetJob still falls back.
+func TestGetJob_FallbackToHistory_EmptyActiveResponse(t *testing.T) {
+	jobID := int32(321)
+	name := "drained-job"
+	ts := int64(1700000000)
+
+	client, cleanup := mockJobServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+
+		switch r.URL.Path {
+		case "/slurm/v0.0.40/job/321":
+			// 200 with an empty job list: the second trigger for the fallback.
+			json.NewEncoder(w).Encode(map[string]interface{}{
+				"jobs": []interface{}{},
+			})
+		case "/slurmdb/v0.0.40/job/321":
+			resp := slurm.OpenapiSlurmdbdJobsResp{
+				Jobs: slurm.JobList{
+					{
+						JobID: &jobID,
+						Name:  &name,
+						State: &slurm.JobState{Current: []string{"COMPLETED"}},
+						Time:  &slurm.JobTime{Submission: &ts, Start: &ts, End: &ts},
+					},
+				},
+			}
+			json.NewEncoder(w).Encode(resp)
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+	defer cleanup()
+
+	svc := NewJobService(client, zap.NewNop())
+	job, err := svc.GetJob(context.Background(), "321")
+	if err != nil {
+		t.Fatalf("GetJob: %v", err)
+	}
+	if job == nil {
+		t.Fatal("expected job, got nil")
+	}
+	if job.JobID != 321 {
+		t.Errorf("expected JobID 321, got %d", job.JobID)
+	}
+	if job.Name != "drained-job" {
+		t.Errorf("expected Name drained-job, got %s", job.Name)
+	}
+}