Add scheduling_map field to ParameterSchema so Application creators can declare that a parameter (e.g. NP) maps to a scheduling field (e.g. cpus). The backend auto-injects the scheduling value into script template variables before rendering, eliminating duplicate user input. The frontend hides mapped parameters from the form and injects their values on submit.
373 lines
8.7 KiB
Go
373 lines
8.7 KiB
Go
package service
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strconv"
|
|
"time"
|
|
|
|
"gcy_hpc_server/internal/model"
|
|
"gcy_hpc_server/internal/slurm"
|
|
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// derefStr returns the string s points to, or "" when s is nil.
func derefStr(s *string) string {
	if s == nil {
		return ""
	}
	return *s
}
|
|
|
|
// derefInt32 returns the int32 i points to, or 0 when i is nil.
func derefInt32(i *int32) int32 {
	if i == nil {
		return 0
	}
	return *i
}
|
|
|
|
// int32Ptr returns a pointer to a fresh copy of i.
func int32Ptr(i int32) *int32 {
	v := i
	return &v
}
|
|
|
|
// derefInt64 returns the int64 i points to, or 0 when i is nil.
func derefInt64(i *int64) int64 {
	if i == nil {
		return 0
	}
	return *i
}
|
|
|
|
// derefInt32ToStr formats the int32 i points to as a base-10 string, or
// returns "" when i is nil.
func derefInt32ToStr(i *int32) string {
	if i == nil {
		return ""
	}
	return strconv.FormatInt(int64(*i), 10)
}
|
|
|
|
// derefInt64ToStr formats the int64 i points to as a base-10 string, or
// returns "" when i is nil.
func derefInt64ToStr(i *int64) string {
	if i == nil {
		return ""
	}
	return strconv.FormatInt(*i, 10)
}
|
|
|
|
func uint32NoValString(v *slurm.Uint32NoVal) string {
|
|
if v == nil {
|
|
return ""
|
|
}
|
|
if v.Infinite != nil && *v.Infinite {
|
|
return "UNLIMITED"
|
|
}
|
|
if v.Number != nil {
|
|
return strconv.FormatInt(*v.Number, 10)
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func derefUint64NoValInt64(v *slurm.Uint64NoVal) *int64 {
|
|
if v != nil && v.Number != nil {
|
|
return v.Number
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func derefCSVString(cs *slurm.CSVString) string {
|
|
if cs == nil || len(*cs) == 0 {
|
|
return ""
|
|
}
|
|
result := ""
|
|
for i, s := range *cs {
|
|
if i > 0 {
|
|
result += ","
|
|
}
|
|
result += s
|
|
}
|
|
return result
|
|
}
|
|
|
|
type ClusterService struct {
|
|
client *slurm.Client
|
|
logger *zap.Logger
|
|
}
|
|
|
|
func NewClusterService(client *slurm.Client, logger *zap.Logger) *ClusterService {
|
|
return &ClusterService{client: client, logger: logger}
|
|
}
|
|
|
|
func (s *ClusterService) GetNodes(ctx context.Context) ([]model.NodeResponse, error) {
|
|
s.logger.Debug("slurm API request",
|
|
zap.String("operation", "GetNodes"),
|
|
)
|
|
|
|
start := time.Now()
|
|
resp, _, err := s.client.Nodes.GetNodes(ctx, nil)
|
|
took := time.Since(start)
|
|
|
|
if err != nil {
|
|
s.logger.Debug("slurm API error response",
|
|
zap.String("operation", "GetNodes"),
|
|
zap.Duration("took", took),
|
|
zap.Error(err),
|
|
)
|
|
s.logger.Error("failed to get nodes", zap.Error(err))
|
|
return nil, fmt.Errorf("get nodes: %w", err)
|
|
}
|
|
|
|
s.logger.Debug("slurm API response",
|
|
zap.String("operation", "GetNodes"),
|
|
zap.Duration("took", took),
|
|
zap.Any("body", resp),
|
|
)
|
|
|
|
if resp.Nodes == nil {
|
|
return nil, nil
|
|
}
|
|
result := make([]model.NodeResponse, 0, len(*resp.Nodes))
|
|
for _, n := range *resp.Nodes {
|
|
result = append(result, mapNode(n))
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (s *ClusterService) GetNode(ctx context.Context, name string) (*model.NodeResponse, error) {
|
|
s.logger.Debug("slurm API request",
|
|
zap.String("operation", "GetNode"),
|
|
zap.String("node_name", name),
|
|
)
|
|
|
|
start := time.Now()
|
|
resp, _, err := s.client.Nodes.GetNode(ctx, name, nil)
|
|
took := time.Since(start)
|
|
|
|
if err != nil {
|
|
s.logger.Debug("slurm API error response",
|
|
zap.String("operation", "GetNode"),
|
|
zap.String("node_name", name),
|
|
zap.Duration("took", took),
|
|
zap.Error(err),
|
|
)
|
|
s.logger.Error("failed to get node", zap.String("name", name), zap.Error(err))
|
|
return nil, fmt.Errorf("get node %s: %w", name, err)
|
|
}
|
|
|
|
s.logger.Debug("slurm API response",
|
|
zap.String("operation", "GetNode"),
|
|
zap.String("node_name", name),
|
|
zap.Duration("took", took),
|
|
zap.Any("body", resp),
|
|
)
|
|
|
|
if resp.Nodes == nil || len(*resp.Nodes) == 0 {
|
|
return nil, nil
|
|
}
|
|
n := (*resp.Nodes)[0]
|
|
mapped := mapNode(n)
|
|
return &mapped, nil
|
|
}
|
|
|
|
func (s *ClusterService) GetPartitions(ctx context.Context) ([]model.PartitionResponse, error) {
|
|
s.logger.Debug("slurm API request",
|
|
zap.String("operation", "GetPartitions"),
|
|
)
|
|
|
|
start := time.Now()
|
|
resp, _, err := s.client.Partitions.GetPartitions(ctx, nil)
|
|
took := time.Since(start)
|
|
|
|
if err != nil {
|
|
s.logger.Debug("slurm API error response",
|
|
zap.String("operation", "GetPartitions"),
|
|
zap.Duration("took", took),
|
|
zap.Error(err),
|
|
)
|
|
s.logger.Error("failed to get partitions", zap.Error(err))
|
|
return nil, fmt.Errorf("get partitions: %w", err)
|
|
}
|
|
|
|
s.logger.Debug("slurm API response",
|
|
zap.String("operation", "GetPartitions"),
|
|
zap.Duration("took", took),
|
|
zap.Any("body", resp),
|
|
)
|
|
|
|
if resp.Partitions == nil {
|
|
return nil, nil
|
|
}
|
|
result := make([]model.PartitionResponse, 0, len(*resp.Partitions))
|
|
for _, pi := range *resp.Partitions {
|
|
result = append(result, mapPartition(pi))
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (s *ClusterService) GetPartition(ctx context.Context, name string) (*model.PartitionResponse, error) {
|
|
s.logger.Debug("slurm API request",
|
|
zap.String("operation", "GetPartition"),
|
|
zap.String("partition_name", name),
|
|
)
|
|
|
|
start := time.Now()
|
|
resp, _, err := s.client.Partitions.GetPartition(ctx, name, nil)
|
|
took := time.Since(start)
|
|
|
|
if err != nil {
|
|
s.logger.Debug("slurm API error response",
|
|
zap.String("operation", "GetPartition"),
|
|
zap.String("partition_name", name),
|
|
zap.Duration("took", took),
|
|
zap.Error(err),
|
|
)
|
|
s.logger.Error("failed to get partition", zap.String("name", name), zap.Error(err))
|
|
return nil, fmt.Errorf("get partition %s: %w", name, err)
|
|
}
|
|
|
|
s.logger.Debug("slurm API response",
|
|
zap.String("operation", "GetPartition"),
|
|
zap.String("partition_name", name),
|
|
zap.Duration("took", took),
|
|
zap.Any("body", resp),
|
|
)
|
|
|
|
if resp.Partitions == nil || len(*resp.Partitions) == 0 {
|
|
return nil, nil
|
|
}
|
|
p := (*resp.Partitions)[0]
|
|
mapped := mapPartition(p)
|
|
return &mapped, nil
|
|
}
|
|
|
|
func (s *ClusterService) GetDiag(ctx context.Context) (*slurm.OpenapiDiagResp, error) {
|
|
s.logger.Debug("slurm API request",
|
|
zap.String("operation", "GetDiag"),
|
|
)
|
|
|
|
start := time.Now()
|
|
resp, _, err := s.client.Diag.GetDiag(ctx)
|
|
took := time.Since(start)
|
|
|
|
if err != nil {
|
|
s.logger.Debug("slurm API error response",
|
|
zap.String("operation", "GetDiag"),
|
|
zap.Duration("took", took),
|
|
zap.Error(err),
|
|
)
|
|
s.logger.Error("failed to get diag", zap.Error(err))
|
|
return nil, fmt.Errorf("get diag: %w", err)
|
|
}
|
|
|
|
s.logger.Debug("slurm API response",
|
|
zap.String("operation", "GetDiag"),
|
|
zap.Duration("took", took),
|
|
zap.Any("body", resp),
|
|
)
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func mapNode(n slurm.Node) model.NodeResponse {
|
|
return model.NodeResponse{
|
|
Name: derefStr(n.Name),
|
|
State: n.State,
|
|
CPUs: derefInt32(n.Cpus),
|
|
AllocCpus: n.AllocCpus,
|
|
Cores: n.Cores,
|
|
Sockets: n.Sockets,
|
|
Threads: n.Threads,
|
|
RealMemory: derefInt64(n.RealMemory),
|
|
AllocMemory: derefInt64(n.AllocMemory),
|
|
FreeMem: derefUint64NoValInt64(n.FreeMem),
|
|
CpuLoad: n.CpuLoad,
|
|
Arch: derefStr(n.Architecture),
|
|
OS: derefStr(n.OperatingSystem),
|
|
Gres: derefStr(n.Gres),
|
|
GresUsed: derefStr(n.GresUsed),
|
|
Reason: derefStr(n.Reason),
|
|
ReasonSetByUser: derefStr(n.ReasonSetByUser),
|
|
Address: derefStr(n.Address),
|
|
Hostname: derefStr(n.Hostname),
|
|
Weight: n.Weight,
|
|
Features: derefCSVString(n.Features),
|
|
ActiveFeatures: derefCSVString(n.ActiveFeatures),
|
|
}
|
|
}
|
|
|
|
func mapPartition(pi slurm.PartitionInfo) model.PartitionResponse {
|
|
var state []string
|
|
var isDefault bool
|
|
if pi.Partition != nil {
|
|
state = pi.Partition.State
|
|
for _, s := range state {
|
|
if s == "DEFAULT" {
|
|
isDefault = true
|
|
break
|
|
}
|
|
}
|
|
}
|
|
var nodes string
|
|
if pi.Nodes != nil {
|
|
nodes = derefStr(pi.Nodes.Configured)
|
|
}
|
|
var totalCPUs int32
|
|
if pi.CPUs != nil {
|
|
totalCPUs = derefInt32(pi.CPUs.Total)
|
|
}
|
|
var totalNodes int32
|
|
if pi.Nodes != nil {
|
|
totalNodes = derefInt32(pi.Nodes.Total)
|
|
}
|
|
var maxTime string
|
|
if pi.Maximums != nil {
|
|
maxTime = uint32NoValString(pi.Maximums.Time)
|
|
}
|
|
var maxNodes *int32
|
|
if pi.Maximums != nil {
|
|
maxNodes = mapUint32NoValToInt32(pi.Maximums.Nodes)
|
|
}
|
|
var maxCPUsPerNode *int32
|
|
if pi.Maximums != nil {
|
|
maxCPUsPerNode = mapUint32NoValToInt32(pi.Maximums.CpusPerNode)
|
|
}
|
|
var minNodes *int32
|
|
if pi.Minimums != nil {
|
|
minNodes = pi.Minimums.Nodes
|
|
}
|
|
var defaultTime string
|
|
if pi.Defaults != nil {
|
|
defaultTime = uint32NoValString(pi.Defaults.Time)
|
|
}
|
|
var graceTime *int32 = pi.GraceTime
|
|
var priority *int32
|
|
if pi.Priority != nil {
|
|
priority = pi.Priority.JobFactor
|
|
}
|
|
var qosAllowed, qosDeny, qosAssigned string
|
|
if pi.QOS != nil {
|
|
qosAllowed = derefStr(pi.QOS.Allowed)
|
|
qosDeny = derefStr(pi.QOS.Deny)
|
|
qosAssigned = derefStr(pi.QOS.Assigned)
|
|
}
|
|
var accountsAllowed, accountsDeny string
|
|
if pi.Accounts != nil {
|
|
accountsAllowed = derefStr(pi.Accounts.Allowed)
|
|
accountsDeny = derefStr(pi.Accounts.Deny)
|
|
}
|
|
return model.PartitionResponse{
|
|
Name: derefStr(pi.Name),
|
|
State: state,
|
|
Default: isDefault,
|
|
Nodes: nodes,
|
|
TotalNodes: totalNodes,
|
|
TotalCPUs: totalCPUs,
|
|
MaxTime: maxTime,
|
|
MaxNodes: maxNodes,
|
|
MaxCPUsPerNode: maxCPUsPerNode,
|
|
MinNodes: minNodes,
|
|
DefaultTime: defaultTime,
|
|
GraceTime: graceTime,
|
|
Priority: priority,
|
|
QOSAllowed: qosAllowed,
|
|
QOSDeny: qosDeny,
|
|
QOSAssigned: qosAssigned,
|
|
AccountsAllowed: accountsAllowed,
|
|
AccountsDeny: accountsDeny,
|
|
}
|
|
}
|