feat(service): add debug logging for Slurm API calls with request/response body and latency
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"time"
|
||||||
|
|
||||||
"gcy_hpc_server/internal/model"
|
"gcy_hpc_server/internal/model"
|
||||||
"gcy_hpc_server/internal/slurm"
|
"gcy_hpc_server/internal/slurm"
|
||||||
@@ -55,11 +56,30 @@ func NewClusterService(client *slurm.Client, logger *zap.Logger) *ClusterService
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *ClusterService) GetNodes(ctx context.Context) ([]model.NodeResponse, error) {
|
func (s *ClusterService) GetNodes(ctx context.Context) ([]model.NodeResponse, error) {
|
||||||
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "GetNodes"),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
resp, _, err := s.client.Nodes.GetNodes(ctx, nil)
|
resp, _, err := s.client.Nodes.GetNodes(ctx, nil)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "GetNodes"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to get nodes", zap.Error(err))
|
s.logger.Error("failed to get nodes", zap.Error(err))
|
||||||
return nil, fmt.Errorf("get nodes: %w", err)
|
return nil, fmt.Errorf("get nodes: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "GetNodes"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Any("body", resp),
|
||||||
|
)
|
||||||
|
|
||||||
if resp.Nodes == nil {
|
if resp.Nodes == nil {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
@@ -71,11 +91,33 @@ func (s *ClusterService) GetNodes(ctx context.Context) ([]model.NodeResponse, er
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *ClusterService) GetNode(ctx context.Context, name string) (*model.NodeResponse, error) {
|
func (s *ClusterService) GetNode(ctx context.Context, name string) (*model.NodeResponse, error) {
|
||||||
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "GetNode"),
|
||||||
|
zap.String("node_name", name),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
resp, _, err := s.client.Nodes.GetNode(ctx, name, nil)
|
resp, _, err := s.client.Nodes.GetNode(ctx, name, nil)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "GetNode"),
|
||||||
|
zap.String("node_name", name),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to get node", zap.String("name", name), zap.Error(err))
|
s.logger.Error("failed to get node", zap.String("name", name), zap.Error(err))
|
||||||
return nil, fmt.Errorf("get node %s: %w", name, err)
|
return nil, fmt.Errorf("get node %s: %w", name, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "GetNode"),
|
||||||
|
zap.String("node_name", name),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Any("body", resp),
|
||||||
|
)
|
||||||
|
|
||||||
if resp.Nodes == nil || len(*resp.Nodes) == 0 {
|
if resp.Nodes == nil || len(*resp.Nodes) == 0 {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
@@ -85,11 +127,30 @@ func (s *ClusterService) GetNode(ctx context.Context, name string) (*model.NodeR
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *ClusterService) GetPartitions(ctx context.Context) ([]model.PartitionResponse, error) {
|
func (s *ClusterService) GetPartitions(ctx context.Context) ([]model.PartitionResponse, error) {
|
||||||
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "GetPartitions"),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
resp, _, err := s.client.Partitions.GetPartitions(ctx, nil)
|
resp, _, err := s.client.Partitions.GetPartitions(ctx, nil)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "GetPartitions"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to get partitions", zap.Error(err))
|
s.logger.Error("failed to get partitions", zap.Error(err))
|
||||||
return nil, fmt.Errorf("get partitions: %w", err)
|
return nil, fmt.Errorf("get partitions: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "GetPartitions"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Any("body", resp),
|
||||||
|
)
|
||||||
|
|
||||||
if resp.Partitions == nil {
|
if resp.Partitions == nil {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
@@ -101,11 +162,33 @@ func (s *ClusterService) GetPartitions(ctx context.Context) ([]model.PartitionRe
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *ClusterService) GetPartition(ctx context.Context, name string) (*model.PartitionResponse, error) {
|
func (s *ClusterService) GetPartition(ctx context.Context, name string) (*model.PartitionResponse, error) {
|
||||||
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "GetPartition"),
|
||||||
|
zap.String("partition_name", name),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
resp, _, err := s.client.Partitions.GetPartition(ctx, name, nil)
|
resp, _, err := s.client.Partitions.GetPartition(ctx, name, nil)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "GetPartition"),
|
||||||
|
zap.String("partition_name", name),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to get partition", zap.String("name", name), zap.Error(err))
|
s.logger.Error("failed to get partition", zap.String("name", name), zap.Error(err))
|
||||||
return nil, fmt.Errorf("get partition %s: %w", name, err)
|
return nil, fmt.Errorf("get partition %s: %w", name, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "GetPartition"),
|
||||||
|
zap.String("partition_name", name),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Any("body", resp),
|
||||||
|
)
|
||||||
|
|
||||||
if resp.Partitions == nil || len(*resp.Partitions) == 0 {
|
if resp.Partitions == nil || len(*resp.Partitions) == 0 {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
@@ -115,11 +198,30 @@ func (s *ClusterService) GetPartition(ctx context.Context, name string) (*model.
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *ClusterService) GetDiag(ctx context.Context) (*slurm.OpenapiDiagResp, error) {
|
func (s *ClusterService) GetDiag(ctx context.Context) (*slurm.OpenapiDiagResp, error) {
|
||||||
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "GetDiag"),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
resp, _, err := s.client.Diag.GetDiag(ctx)
|
resp, _, err := s.client.Diag.GetDiag(ctx)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "GetDiag"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to get diag", zap.Error(err))
|
s.logger.Error("failed to get diag", zap.Error(err))
|
||||||
return nil, fmt.Errorf("get diag: %w", err)
|
return nil, fmt.Errorf("get diag: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "GetDiag"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Any("body", resp),
|
||||||
|
)
|
||||||
|
|
||||||
return resp, nil
|
return resp, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"time"
|
||||||
|
|
||||||
"gcy_hpc_server/internal/model"
|
"gcy_hpc_server/internal/model"
|
||||||
"gcy_hpc_server/internal/slurm"
|
"gcy_hpc_server/internal/slurm"
|
||||||
@@ -45,12 +46,31 @@ func (s *JobService) SubmitJob(ctx context.Context, req *model.SubmitJobRequest)
|
|||||||
Job: jobDesc,
|
Job: jobDesc,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "SubmitJob"),
|
||||||
|
zap.Any("body", submitReq),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
result, _, err := s.client.Jobs.SubmitJob(ctx, submitReq)
|
result, _, err := s.client.Jobs.SubmitJob(ctx, submitReq)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "SubmitJob"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to submit job", zap.Error(err), zap.String("operation", "submit"))
|
s.logger.Error("failed to submit job", zap.Error(err), zap.String("operation", "submit"))
|
||||||
return nil, fmt.Errorf("submit job: %w", err)
|
return nil, fmt.Errorf("submit job: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "SubmitJob"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Any("body", result),
|
||||||
|
)
|
||||||
|
|
||||||
resp := &model.JobResponse{}
|
resp := &model.JobResponse{}
|
||||||
if result.Result != nil && result.Result.JobID != nil {
|
if result.Result != nil && result.Result.JobID != nil {
|
||||||
resp.JobID = *result.Result.JobID
|
resp.JobID = *result.Result.JobID
|
||||||
@@ -64,12 +84,31 @@ func (s *JobService) SubmitJob(ctx context.Context, req *model.SubmitJobRequest)
|
|||||||
|
|
||||||
// GetJobs lists all current jobs from Slurm.
|
// GetJobs lists all current jobs from Slurm.
|
||||||
func (s *JobService) GetJobs(ctx context.Context) ([]model.JobResponse, error) {
|
func (s *JobService) GetJobs(ctx context.Context) ([]model.JobResponse, error) {
|
||||||
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "GetJobs"),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
result, _, err := s.client.Jobs.GetJobs(ctx, nil)
|
result, _, err := s.client.Jobs.GetJobs(ctx, nil)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "GetJobs"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to get jobs", zap.Error(err), zap.String("operation", "get_jobs"))
|
s.logger.Error("failed to get jobs", zap.Error(err), zap.String("operation", "get_jobs"))
|
||||||
return nil, fmt.Errorf("get jobs: %w", err)
|
return nil, fmt.Errorf("get jobs: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "GetJobs"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Int("job_count", len(result.Jobs)),
|
||||||
|
zap.Any("body", result),
|
||||||
|
)
|
||||||
|
|
||||||
jobs := make([]model.JobResponse, 0, len(result.Jobs))
|
jobs := make([]model.JobResponse, 0, len(result.Jobs))
|
||||||
for i := range result.Jobs {
|
for i := range result.Jobs {
|
||||||
jobs = append(jobs, mapJobInfo(&result.Jobs[i]))
|
jobs = append(jobs, mapJobInfo(&result.Jobs[i]))
|
||||||
@@ -79,12 +118,33 @@ func (s *JobService) GetJobs(ctx context.Context) ([]model.JobResponse, error) {
|
|||||||
|
|
||||||
// GetJob retrieves a single job by ID.
|
// GetJob retrieves a single job by ID.
|
||||||
func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobResponse, error) {
|
func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobResponse, error) {
|
||||||
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "GetJob"),
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
result, _, err := s.client.Jobs.GetJob(ctx, jobID, nil)
|
result, _, err := s.client.Jobs.GetJob(ctx, jobID, nil)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "GetJob"),
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to get job", zap.Error(err), zap.String("job_id", jobID), zap.String("operation", "get_job"))
|
s.logger.Error("failed to get job", zap.Error(err), zap.String("job_id", jobID), zap.String("operation", "get_job"))
|
||||||
return nil, fmt.Errorf("get job %s: %w", jobID, err)
|
return nil, fmt.Errorf("get job %s: %w", jobID, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "GetJob"),
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Any("body", result),
|
||||||
|
)
|
||||||
|
|
||||||
if len(result.Jobs) == 0 {
|
if len(result.Jobs) == 0 {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
@@ -95,11 +155,32 @@ func (s *JobService) GetJob(ctx context.Context, jobID string) (*model.JobRespon
|
|||||||
|
|
||||||
// CancelJob cancels a job by ID.
|
// CancelJob cancels a job by ID.
|
||||||
func (s *JobService) CancelJob(ctx context.Context, jobID string) error {
|
func (s *JobService) CancelJob(ctx context.Context, jobID string) error {
|
||||||
_, _, err := s.client.Jobs.DeleteJob(ctx, jobID, nil)
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "CancelJob"),
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
result, _, err := s.client.Jobs.DeleteJob(ctx, jobID, nil)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "CancelJob"),
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to cancel job", zap.Error(err), zap.String("job_id", jobID), zap.String("operation", "cancel"))
|
s.logger.Error("failed to cancel job", zap.Error(err), zap.String("job_id", jobID), zap.String("operation", "cancel"))
|
||||||
return fmt.Errorf("cancel job %s: %w", jobID, err)
|
return fmt.Errorf("cancel job %s: %w", jobID, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "CancelJob"),
|
||||||
|
zap.String("job_id", jobID),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Any("body", result),
|
||||||
|
)
|
||||||
s.logger.Info("job cancelled", zap.String("job_id", jobID))
|
s.logger.Info("job cancelled", zap.String("job_id", jobID))
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -129,12 +210,32 @@ func (s *JobService) GetJobHistory(ctx context.Context, query *model.JobHistoryQ
|
|||||||
opts.EndTime = strToPtr(query.EndTime)
|
opts.EndTime = strToPtr(query.EndTime)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API request",
|
||||||
|
zap.String("operation", "GetJobHistory"),
|
||||||
|
zap.Any("body", opts),
|
||||||
|
)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
result, _, err := s.client.SlurmdbJobs.GetJobs(ctx, opts)
|
result, _, err := s.client.SlurmdbJobs.GetJobs(ctx, opts)
|
||||||
|
took := time.Since(start)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
s.logger.Debug("slurm API error response",
|
||||||
|
zap.String("operation", "GetJobHistory"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
s.logger.Error("failed to get job history", zap.Error(err), zap.String("operation", "get_job_history"))
|
s.logger.Error("failed to get job history", zap.Error(err), zap.String("operation", "get_job_history"))
|
||||||
return nil, fmt.Errorf("get job history: %w", err)
|
return nil, fmt.Errorf("get job history: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.logger.Debug("slurm API response",
|
||||||
|
zap.String("operation", "GetJobHistory"),
|
||||||
|
zap.Duration("took", took),
|
||||||
|
zap.Int("job_count", len(result.Jobs)),
|
||||||
|
zap.Any("body", result),
|
||||||
|
)
|
||||||
|
|
||||||
allJobs := make([]model.JobResponse, 0, len(result.Jobs))
|
allJobs := make([]model.JobResponse, 0, len(result.Jobs))
|
||||||
for i := range result.Jobs {
|
for i := range result.Jobs {
|
||||||
allJobs = append(allJobs, mapSlurmdbJob(&result.Jobs[i]))
|
allJobs = append(allJobs, mapSlurmdbJob(&result.Jobs[i]))
|
||||||
@@ -150,17 +251,17 @@ func (s *JobService) GetJobHistory(ctx context.Context, query *model.JobHistoryQ
|
|||||||
pageSize = 20
|
pageSize = 20
|
||||||
}
|
}
|
||||||
|
|
||||||
start := (page - 1) * pageSize
|
startIdx := (page - 1) * pageSize
|
||||||
end := start + pageSize
|
end := startIdx + pageSize
|
||||||
if start > total {
|
if startIdx > total {
|
||||||
start = total
|
startIdx = total
|
||||||
}
|
}
|
||||||
if end > total {
|
if end > total {
|
||||||
end = total
|
end = total
|
||||||
}
|
}
|
||||||
|
|
||||||
return &model.JobListResponse{
|
return &model.JobListResponse{
|
||||||
Jobs: allJobs[start:end],
|
Jobs: allJobs[startIdx:end],
|
||||||
Total: total,
|
Total: total,
|
||||||
Page: page,
|
Page: page,
|
||||||
PageSize: pageSize,
|
PageSize: pageSize,
|
||||||
|
|||||||
Reference in New Issue
Block a user