- JobService: 提交、查询、取消、历史记录,记录关键操作日志
- ClusterService: 节点、分区、诊断查询,记录错误日志
- NewSlurmClient: JWT 认证 HTTP 客户端工厂
- 所有构造函数接受 *zap.Logger 参数实现依赖注入
- 提交/取消成功记录 Info,API 错误记录 Error
- 完整 TDD 测试,使用 zaptest/observer 验证日志输出

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
168 lines
4.0 KiB
Go
168 lines
4.0 KiB
Go
package service
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strconv"
|
|
|
|
"gcy_hpc_server/internal/model"
|
|
"gcy_hpc_server/internal/slurm"
|
|
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// derefStr returns the string s points to, or the empty string when s is nil.
func derefStr(s *string) string {
	if s != nil {
		return *s
	}
	return ""
}
|
|
|
|
// derefInt32 returns the value i points to, or 0 when i is nil.
func derefInt32(i *int32) int32 {
	var out int32
	if i != nil {
		out = *i
	}
	return out
}
|
|
|
|
// derefInt64 returns the value i points to, or 0 when i is nil.
func derefInt64(i *int64) int64 {
	var out int64
	if i != nil {
		out = *i
	}
	return out
}
|
|
|
|
func uint32NoValString(v *slurm.Uint32NoVal) string {
|
|
if v == nil {
|
|
return ""
|
|
}
|
|
if v.Infinite != nil && *v.Infinite {
|
|
return "UNLIMITED"
|
|
}
|
|
if v.Number != nil {
|
|
return strconv.FormatInt(*v.Number, 10)
|
|
}
|
|
return ""
|
|
}
|
|
|
|
type ClusterService struct {
|
|
client *slurm.Client
|
|
logger *zap.Logger
|
|
}
|
|
|
|
func NewClusterService(client *slurm.Client, logger *zap.Logger) *ClusterService {
|
|
return &ClusterService{client: client, logger: logger}
|
|
}
|
|
|
|
func (s *ClusterService) GetNodes(ctx context.Context) ([]model.NodeResponse, error) {
|
|
resp, _, err := s.client.Nodes.GetNodes(ctx, nil)
|
|
if err != nil {
|
|
s.logger.Error("failed to get nodes", zap.Error(err))
|
|
return nil, fmt.Errorf("get nodes: %w", err)
|
|
}
|
|
if resp.Nodes == nil {
|
|
return nil, nil
|
|
}
|
|
result := make([]model.NodeResponse, 0, len(*resp.Nodes))
|
|
for _, n := range *resp.Nodes {
|
|
result = append(result, mapNode(n))
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (s *ClusterService) GetNode(ctx context.Context, name string) (*model.NodeResponse, error) {
|
|
resp, _, err := s.client.Nodes.GetNode(ctx, name, nil)
|
|
if err != nil {
|
|
s.logger.Error("failed to get node", zap.String("name", name), zap.Error(err))
|
|
return nil, fmt.Errorf("get node %s: %w", name, err)
|
|
}
|
|
if resp.Nodes == nil || len(*resp.Nodes) == 0 {
|
|
return nil, nil
|
|
}
|
|
n := (*resp.Nodes)[0]
|
|
mapped := mapNode(n)
|
|
return &mapped, nil
|
|
}
|
|
|
|
func (s *ClusterService) GetPartitions(ctx context.Context) ([]model.PartitionResponse, error) {
|
|
resp, _, err := s.client.Partitions.GetPartitions(ctx, nil)
|
|
if err != nil {
|
|
s.logger.Error("failed to get partitions", zap.Error(err))
|
|
return nil, fmt.Errorf("get partitions: %w", err)
|
|
}
|
|
if resp.Partitions == nil {
|
|
return nil, nil
|
|
}
|
|
result := make([]model.PartitionResponse, 0, len(*resp.Partitions))
|
|
for _, pi := range *resp.Partitions {
|
|
result = append(result, mapPartition(pi))
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (s *ClusterService) GetPartition(ctx context.Context, name string) (*model.PartitionResponse, error) {
|
|
resp, _, err := s.client.Partitions.GetPartition(ctx, name, nil)
|
|
if err != nil {
|
|
s.logger.Error("failed to get partition", zap.String("name", name), zap.Error(err))
|
|
return nil, fmt.Errorf("get partition %s: %w", name, err)
|
|
}
|
|
if resp.Partitions == nil || len(*resp.Partitions) == 0 {
|
|
return nil, nil
|
|
}
|
|
p := (*resp.Partitions)[0]
|
|
mapped := mapPartition(p)
|
|
return &mapped, nil
|
|
}
|
|
|
|
func (s *ClusterService) GetDiag(ctx context.Context) (*slurm.OpenapiDiagResp, error) {
|
|
resp, _, err := s.client.Diag.GetDiag(ctx)
|
|
if err != nil {
|
|
s.logger.Error("failed to get diag", zap.Error(err))
|
|
return nil, fmt.Errorf("get diag: %w", err)
|
|
}
|
|
return resp, nil
|
|
}
|
|
|
|
func mapNode(n slurm.Node) model.NodeResponse {
|
|
return model.NodeResponse{
|
|
Name: derefStr(n.Name),
|
|
State: n.State,
|
|
CPUs: derefInt32(n.Cpus),
|
|
RealMemory: derefInt64(n.RealMemory),
|
|
AllocMem: derefInt64(n.AllocMemory),
|
|
Arch: derefStr(n.Architecture),
|
|
OS: derefStr(n.OperatingSystem),
|
|
}
|
|
}
|
|
|
|
func mapPartition(pi slurm.PartitionInfo) model.PartitionResponse {
|
|
var state []string
|
|
if pi.Partition != nil {
|
|
state = pi.Partition.State
|
|
}
|
|
var nodes string
|
|
if pi.Nodes != nil {
|
|
nodes = derefStr(pi.Nodes.Configured)
|
|
}
|
|
var totalCPUs int32
|
|
if pi.CPUs != nil {
|
|
totalCPUs = derefInt32(pi.CPUs.Total)
|
|
}
|
|
var totalNodes int32
|
|
if pi.Nodes != nil {
|
|
totalNodes = derefInt32(pi.Nodes.Total)
|
|
}
|
|
var maxTime string
|
|
if pi.Maximums != nil {
|
|
maxTime = uint32NoValString(pi.Maximums.Time)
|
|
}
|
|
return model.PartitionResponse{
|
|
Name: derefStr(pi.Name),
|
|
State: state,
|
|
Nodes: nodes,
|
|
TotalCPUs: totalCPUs,
|
|
TotalNodes: totalNodes,
|
|
MaxTime: maxTime,
|
|
}
|
|
}
|