Files
hpc/internal/service/cluster_service.go

270 lines
6.2 KiB
Go

package service
import (
"context"
"fmt"
"strconv"
"time"
"gcy_hpc_server/internal/model"
"gcy_hpc_server/internal/slurm"
"go.uber.org/zap"
)
// derefStr returns the string s points to, or "" when s is nil.
func derefStr(s *string) string {
	var out string
	if s != nil {
		out = *s
	}
	return out
}
// derefInt32 returns the int32 i points to, or 0 when i is nil.
func derefInt32(i *int32) int32 {
	if i != nil {
		return *i
	}
	return 0
}
// derefInt64 returns the int64 i points to, or 0 when i is nil.
func derefInt64(i *int64) int64 {
	var out int64
	if i != nil {
		out = *i
	}
	return out
}
// uint32NoValString renders a slurm.Uint32NoVal as text.
//
// The cases are evaluated in order:
//   - a nil value yields "";
//   - an Infinite flag that is present and true yields "UNLIMITED";
//   - a present Number is formatted in base 10;
//   - anything else yields "".
func uint32NoValString(v *slurm.Uint32NoVal) string {
	switch {
	case v == nil:
		return ""
	case v.Infinite != nil && *v.Infinite:
		// Infinite takes precedence over Number when both are populated.
		return "UNLIMITED"
	case v.Number != nil:
		return strconv.FormatInt(*v.Number, 10)
	default:
		return ""
	}
}
// ClusterService answers cluster queries (nodes, partitions, diagnostics) by
// calling a Slurm client and logging each call.
type ClusterService struct {
	// client is the Slurm API client every query is issued through.
	client *slurm.Client
	// logger receives the debug and error entries written around each call.
	logger *zap.Logger
}
// NewClusterService builds a ClusterService that uses client for Slurm calls
// and logger for its log output. Both arguments are stored as given.
func NewClusterService(client *slurm.Client, logger *zap.Logger) *ClusterService {
	svc := new(ClusterService)
	svc.client = client
	svc.logger = logger
	return svc
}
// GetNodes requests the node list from the Slurm client and converts every
// entry to a model.NodeResponse.
//
// The call is timed and traced at debug level (request, then either response
// or error). A client error is also logged at error level and returned wrapped
// with the "get nodes" prefix. When the response carries no node list the
// result is (nil, nil).
func (s *ClusterService) GetNodes(ctx context.Context) ([]model.NodeResponse, error) {
	const op = "GetNodes"

	s.logger.Debug("slurm API request", zap.String("operation", op))

	began := time.Now()
	resp, _, err := s.client.Nodes.GetNodes(ctx, nil)
	elapsed := time.Since(began)

	if err != nil {
		s.logger.Debug("slurm API error response",
			zap.String("operation", op),
			zap.Duration("took", elapsed),
			zap.Error(err),
		)
		s.logger.Error("failed to get nodes", zap.Error(err))
		return nil, fmt.Errorf("get nodes: %w", err)
	}

	s.logger.Debug("slurm API response",
		zap.String("operation", op),
		zap.Duration("took", elapsed),
		zap.Any("body", resp),
	)

	if resp.Nodes == nil {
		return nil, nil
	}

	// Fill a slice of the exact final length, one mapped entry per node.
	nodes := *resp.Nodes
	out := make([]model.NodeResponse, len(nodes))
	for i := range nodes {
		out[i] = mapNode(nodes[i])
	}
	return out, nil
}
// GetNode requests a single node by name from the Slurm client and converts
// the first entry of the response to a model.NodeResponse.
//
// The call is timed and traced at debug level. A client error is also logged
// at error level and returned wrapped as "get node <name>". When the response
// has no node list, or the list is empty, the result is (nil, nil).
func (s *ClusterService) GetNode(ctx context.Context, name string) (*model.NodeResponse, error) {
	opField := zap.String("operation", "GetNode")
	nameField := zap.String("node_name", name)

	s.logger.Debug("slurm API request", opField, nameField)

	began := time.Now()
	resp, _, err := s.client.Nodes.GetNode(ctx, name, nil)
	elapsed := time.Since(began)

	if err != nil {
		s.logger.Debug("slurm API error response",
			opField,
			nameField,
			zap.Duration("took", elapsed),
			zap.Error(err),
		)
		s.logger.Error("failed to get node", zap.String("name", name), zap.Error(err))
		return nil, fmt.Errorf("get node %s: %w", name, err)
	}

	s.logger.Debug("slurm API response",
		opField,
		nameField,
		zap.Duration("took", elapsed),
		zap.Any("body", resp),
	)

	if resp.Nodes == nil {
		return nil, nil
	}
	nodes := *resp.Nodes
	if len(nodes) == 0 {
		return nil, nil
	}

	// Only the first entry of the response is used.
	node := mapNode(nodes[0])
	return &node, nil
}
// GetPartitions requests the partition list from the Slurm client and converts
// every entry to a model.PartitionResponse.
//
// The call is timed and traced at debug level (request, then either response
// or error). A client error is also logged at error level and returned wrapped
// with the "get partitions" prefix. When the response carries no partition
// list the result is (nil, nil).
func (s *ClusterService) GetPartitions(ctx context.Context) ([]model.PartitionResponse, error) {
	const op = "GetPartitions"

	s.logger.Debug("slurm API request", zap.String("operation", op))

	began := time.Now()
	resp, _, err := s.client.Partitions.GetPartitions(ctx, nil)
	elapsed := time.Since(began)

	if err != nil {
		s.logger.Debug("slurm API error response",
			zap.String("operation", op),
			zap.Duration("took", elapsed),
			zap.Error(err),
		)
		s.logger.Error("failed to get partitions", zap.Error(err))
		return nil, fmt.Errorf("get partitions: %w", err)
	}

	s.logger.Debug("slurm API response",
		zap.String("operation", op),
		zap.Duration("took", elapsed),
		zap.Any("body", resp),
	)

	if resp.Partitions == nil {
		return nil, nil
	}

	// Fill a slice of the exact final length, one mapped entry per partition.
	parts := *resp.Partitions
	out := make([]model.PartitionResponse, len(parts))
	for i := range parts {
		out[i] = mapPartition(parts[i])
	}
	return out, nil
}
// GetPartition requests a single partition by name from the Slurm client and
// converts the first entry of the response to a model.PartitionResponse.
//
// The call is timed and traced at debug level. A client error is also logged
// at error level and returned wrapped as "get partition <name>". When the
// response has no partition list, or the list is empty, the result is
// (nil, nil).
func (s *ClusterService) GetPartition(ctx context.Context, name string) (*model.PartitionResponse, error) {
	opField := zap.String("operation", "GetPartition")
	nameField := zap.String("partition_name", name)

	s.logger.Debug("slurm API request", opField, nameField)

	began := time.Now()
	resp, _, err := s.client.Partitions.GetPartition(ctx, name, nil)
	elapsed := time.Since(began)

	if err != nil {
		s.logger.Debug("slurm API error response",
			opField,
			nameField,
			zap.Duration("took", elapsed),
			zap.Error(err),
		)
		s.logger.Error("failed to get partition", zap.String("name", name), zap.Error(err))
		return nil, fmt.Errorf("get partition %s: %w", name, err)
	}

	s.logger.Debug("slurm API response",
		opField,
		nameField,
		zap.Duration("took", elapsed),
		zap.Any("body", resp),
	)

	if resp.Partitions == nil {
		return nil, nil
	}
	parts := *resp.Partitions
	if len(parts) == 0 {
		return nil, nil
	}

	// Only the first entry of the response is used.
	part := mapPartition(parts[0])
	return &part, nil
}
// GetDiag requests diagnostics from the Slurm client and returns the client's
// response value unchanged.
//
// The call is timed and traced at debug level. A client error is also logged
// at error level and returned wrapped with the "get diag" prefix.
func (s *ClusterService) GetDiag(ctx context.Context) (*slurm.OpenapiDiagResp, error) {
	opField := zap.String("operation", "GetDiag")

	s.logger.Debug("slurm API request", opField)

	began := time.Now()
	diag, _, err := s.client.Diag.GetDiag(ctx)
	elapsed := time.Since(began)

	if err != nil {
		s.logger.Debug("slurm API error response",
			opField,
			zap.Duration("took", elapsed),
			zap.Error(err),
		)
		s.logger.Error("failed to get diag", zap.Error(err))
		return nil, fmt.Errorf("get diag: %w", err)
	}

	s.logger.Debug("slurm API response",
		opField,
		zap.Duration("took", elapsed),
		zap.Any("body", diag),
	)
	return diag, nil
}
// mapNode converts a slurm.Node into a model.NodeResponse.
//
// Pointer fields are read through the deref helpers, so a nil field becomes
// the zero value ("" or 0). State is copied across as-is.
func mapNode(n slurm.Node) model.NodeResponse {
	var out model.NodeResponse
	out.Name = derefStr(n.Name)
	out.State = n.State
	out.CPUs = derefInt32(n.Cpus)
	out.RealMemory = derefInt64(n.RealMemory)
	out.AllocMem = derefInt64(n.AllocMemory)
	out.Arch = derefStr(n.Architecture)
	out.OS = derefStr(n.OperatingSystem)
	return out
}
// mapPartition converts a slurm.PartitionInfo into a model.PartitionResponse.
//
// Each nested section (Partition, Nodes, CPUs, Maximums) is optional: when a
// section is nil, the response fields sourced from it keep their zero values.
func mapPartition(pi slurm.PartitionInfo) model.PartitionResponse {
	out := model.PartitionResponse{Name: derefStr(pi.Name)}

	if part := pi.Partition; part != nil {
		out.State = part.State
	}
	if nodes := pi.Nodes; nodes != nil {
		out.Nodes = derefStr(nodes.Configured)
		out.TotalNodes = derefInt32(nodes.Total)
	}
	if cpus := pi.CPUs; cpus != nil {
		out.TotalCPUs = derefInt32(cpus.Total)
	}
	if limits := pi.Maximums; limits != nil {
		out.MaxTime = uint32NoValString(limits.Time)
	}
	return out
}