diff --git a/internal/service/cluster_service.go b/internal/service/cluster_service.go index 3d1cdfa..3cd9426 100644 --- a/internal/service/cluster_service.go +++ b/internal/service/cluster_service.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "strconv" + "time" "gcy_hpc_server/internal/model" "gcy_hpc_server/internal/slurm" @@ -55,11 +56,30 @@ func NewClusterService(client *slurm.Client, logger *zap.Logger) *ClusterService } func (s *ClusterService) GetNodes(ctx context.Context) ([]model.NodeResponse, error) { + s.logger.Debug("slurm API request", + zap.String("operation", "GetNodes"), + ) + + start := time.Now() resp, _, err := s.client.Nodes.GetNodes(ctx, nil) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "GetNodes"), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to get nodes", zap.Error(err)) return nil, fmt.Errorf("get nodes: %w", err) } + + s.logger.Debug("slurm API response", + zap.String("operation", "GetNodes"), + zap.Duration("took", took), + zap.Any("body", resp), + ) + if resp.Nodes == nil { return nil, nil } @@ -71,11 +91,33 @@ func (s *ClusterService) GetNodes(ctx context.Context) ([]model.NodeResponse, er } func (s *ClusterService) GetNode(ctx context.Context, name string) (*model.NodeResponse, error) { + s.logger.Debug("slurm API request", + zap.String("operation", "GetNode"), + zap.String("node_name", name), + ) + + start := time.Now() resp, _, err := s.client.Nodes.GetNode(ctx, name, nil) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "GetNode"), + zap.String("node_name", name), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to get node", zap.String("name", name), zap.Error(err)) return nil, fmt.Errorf("get node %s: %w", name, err) } + + s.logger.Debug("slurm API response", + zap.String("operation", "GetNode"), + zap.String("node_name", name), + zap.Duration("took", took), + zap.Any("body", resp), + ) + if resp.Nodes == nil || len(*resp.Nodes) == 0 { return nil, nil } @@ -85,11 +127,30 @@ func (s *ClusterService) GetNode(ctx context.Context, name string) (*model.NodeR } func (s *ClusterService) GetPartitions(ctx context.Context) ([]model.PartitionResponse, error) { + s.logger.Debug("slurm API request", + zap.String("operation", "GetPartitions"), + ) + + start := time.Now() resp, _, err := s.client.Partitions.GetPartitions(ctx, nil) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "GetPartitions"), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to get partitions", zap.Error(err)) return nil, fmt.Errorf("get partitions: %w", err) } + + s.logger.Debug("slurm API response", + zap.String("operation", "GetPartitions"), + zap.Duration("took", took), + zap.Any("body", resp), + ) + if resp.Partitions == nil { return nil, nil } @@ -101,11 +162,33 @@ func (s *ClusterService) GetPartitions(ctx context.Context) ([]model.PartitionRe } func (s *ClusterService) GetPartition(ctx context.Context, name string) (*model.PartitionResponse, error) { + s.logger.Debug("slurm API request", + zap.String("operation", "GetPartition"), + zap.String("partition_name", name), + ) + + start := time.Now() resp, _, err := s.client.Partitions.GetPartition(ctx, name, nil) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "GetPartition"), + zap.String("partition_name", name), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to get partition", zap.String("name", name), zap.Error(err)) return nil, fmt.Errorf("get partition %s: %w", name, err) } + + s.logger.Debug("slurm API response", + zap.String("operation", "GetPartition"), + zap.String("partition_name", name), + zap.Duration("took", took), + zap.Any("body", resp), + ) + if resp.Partitions == nil || len(*resp.Partitions) == 0 { return nil, nil } @@ -115,11 +198,30 @@ func (s *ClusterService) GetPartition(ctx context.Context, name string) (*model. } func (s *ClusterService) GetDiag(ctx context.Context) (*slurm.OpenapiDiagResp, error) { + s.logger.Debug("slurm API request", + zap.String("operation", "GetDiag"), + ) + + start := time.Now() resp, _, err := s.client.Diag.GetDiag(ctx) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "GetDiag"), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to get diag", zap.Error(err)) return nil, fmt.Errorf("get diag: %w", err) } + + s.logger.Debug("slurm API response", + zap.String("operation", "GetDiag"), + zap.Duration("took", took), + zap.Any("body", resp), + ) + return resp, nil } diff --git a/internal/service/job_service.go b/internal/service/job_service.go index 636ee99..fe5067f 100644 --- a/internal/service/job_service.go +++ b/internal/service/job_service.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "strconv" + "time" "gcy_hpc_server/internal/model" "gcy_hpc_server/internal/slurm" @@ -45,12 +46,31 @@ func (s *JobService) SubmitJob(ctx context.Context, req *model.SubmitJobRequest) Job: jobDesc, } + s.logger.Debug("slurm API request", + zap.String("operation", "SubmitJob"), + zap.Any("body", submitReq), + ) + + start := time.Now() result, _, err := s.client.Jobs.SubmitJob(ctx, submitReq) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "SubmitJob"), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to submit job", zap.Error(err), zap.String("operation", "submit")) return nil, fmt.Errorf("submit job: %w", err) } + s.logger.Debug("slurm API response", + zap.String("operation", "SubmitJob"), + zap.Duration("took", took), + zap.Any("body", result), + ) + resp := &model.JobResponse{} if result.Result != nil && result.Result.JobID != nil { resp.JobID = *result.Result.JobID @@ -64,12 +84,31 @@ func (s *JobService) SubmitJob(ctx context.Context, req *model.SubmitJobRequest) // GetJobs lists all current jobs from Slurm. func (s *JobService) GetJobs(ctx context.Context) ([]model.JobResponse, error) { + s.logger.Debug("slurm API request", + zap.String("operation", "GetJobs"), + ) + + start := time.Now() result, _, err := s.client.Jobs.GetJobs(ctx, nil) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "GetJobs"), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to get jobs", zap.Error(err), zap.String("operation", "get_jobs")) return nil, fmt.Errorf("get jobs: %w", err) } + s.logger.Debug("slurm API response", + zap.String("operation", "GetJobs"), + zap.Duration("took", took), + zap.Int("job_count", len(result.Jobs)), + zap.Any("body", result), + ) + jobs := make([]model.JobResponse, 0, len(result.Jobs)) for i := range result.Jobs { jobs = append(jobs, mapJobInfo(&result.Jobs[i])) @@ -79,12 +118,33 @@ func (s *JobService) GetJobs(ctx context.Context) ([]model.JobResponse, error) { // GetJob retrieves a single job by ID. func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobResponse, error) { + s.logger.Debug("slurm API request", + zap.String("operation", "GetJob"), + zap.String("job_id", jobID), + ) + + start := time.Now() result, _, err := s.client.Jobs.GetJob(ctx, jobID, nil) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "GetJob"), + zap.String("job_id", jobID), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to get job", zap.Error(err), zap.String("job_id", jobID), zap.String("operation", "get_job")) return nil, fmt.Errorf("get job %s: %w", jobID, err) } + s.logger.Debug("slurm API response", + zap.String("operation", "GetJob"), + zap.String("job_id", jobID), + zap.Duration("took", took), + zap.Any("body", result), + ) + if len(result.Jobs) == 0 { return nil, nil } @@ -95,11 +155,32 @@ func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobRespon // CancelJob cancels a job by ID. func (s *JobService) CancelJob(ctx context.Context, jobID string) error { - _, _, err := s.client.Jobs.DeleteJob(ctx, jobID, nil) + s.logger.Debug("slurm API request", + zap.String("operation", "CancelJob"), + zap.String("job_id", jobID), + ) + + start := time.Now() + result, _, err := s.client.Jobs.DeleteJob(ctx, jobID, nil) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "CancelJob"), + zap.String("job_id", jobID), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to cancel job", zap.Error(err), zap.String("job_id", jobID), zap.String("operation", "cancel")) return fmt.Errorf("cancel job %s: %w", jobID, err) } + + s.logger.Debug("slurm API response", + zap.String("operation", "CancelJob"), + zap.String("job_id", jobID), + zap.Duration("took", took), + zap.Any("body", result), + ) s.logger.Info("job cancelled", zap.String("job_id", jobID)) return nil } @@ -129,12 +210,32 @@ func (s *JobService) GetJobHistory(ctx context.Context, query *model.JobHistoryQ opts.EndTime = strToPtr(query.EndTime) } + s.logger.Debug("slurm API request", + zap.String("operation", "GetJobHistory"), + zap.Any("body", opts), + ) + + start := time.Now() result, _, err := s.client.SlurmdbJobs.GetJobs(ctx, opts) + took := time.Since(start) + if err != nil { + s.logger.Debug("slurm API error response", + zap.String("operation", "GetJobHistory"), + zap.Duration("took", took), + zap.Error(err), + ) s.logger.Error("failed to get job history", zap.Error(err), zap.String("operation", "get_job_history")) return nil, fmt.Errorf("get job history: %w", err) } + s.logger.Debug("slurm API response", + zap.String("operation", "GetJobHistory"), + zap.Duration("took", took), + zap.Int("job_count", len(result.Jobs)), + zap.Any("body", result), + ) + allJobs := make([]model.JobResponse, 0, len(result.Jobs)) for i := range result.Jobs { allJobs = append(allJobs, mapSlurmdbJob(&result.Jobs[i])) @@ -150,17 +251,17 @@ func (s *JobService) GetJobHistory(ctx context.Context, query *model.JobHistoryQ pageSize = 20 } - start := (page - 1) * pageSize - end := start + pageSize - if start > total { - start = total + startIdx := (page - 1) * pageSize + end := startIdx + pageSize + if startIdx > total { + startIdx = total } if end > total { end = total } return &model.JobListResponse{ - Jobs: allJobs[start:end], + Jobs: allJobs[startIdx:end], Total: total, Page: page, PageSize: pageSize,