feat(service): add task defaults, job status, and cluster helpers
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
@@ -122,13 +122,40 @@ func (s *TaskService) CreateTask(ctx context.Context, req *model.CreateTaskReque
|
||||
|
||||
// 8. Create task record
|
||||
task := &model.Task{
|
||||
TaskName: taskName,
|
||||
AppID: app.ID,
|
||||
AppName: app.Name,
|
||||
Status: model.TaskStatusSubmitted,
|
||||
Values: valuesJSON,
|
||||
InputFileIDs: fileIDsJSON,
|
||||
SubmittedAt: time.Now(),
|
||||
TaskName: taskName,
|
||||
AppID: app.ID,
|
||||
AppName: app.Name,
|
||||
Status: model.TaskStatusSubmitted,
|
||||
Values: valuesJSON,
|
||||
InputFileIDs: fileIDsJSON,
|
||||
SubmittedAt: time.Now(),
|
||||
Partition: derefStr(req.Partition),
|
||||
Cpus: req.Cpus,
|
||||
MemoryPerNode: req.MemoryPerNode,
|
||||
MemoryPerCpu: req.MemoryPerCpu,
|
||||
TimeLimit: req.TimeLimit,
|
||||
QOS: req.QOS,
|
||||
JobName: req.JobName,
|
||||
Nodes: req.Nodes,
|
||||
Tasks: req.Tasks,
|
||||
CpusPerTask: req.CpusPerTask,
|
||||
Constraints: req.Constraints,
|
||||
Reservation: req.Reservation,
|
||||
Account: req.Account,
|
||||
Nice: req.Nice,
|
||||
MailType: req.MailType,
|
||||
MailUser: req.MailUser,
|
||||
StandardOutput: req.StandardOutput,
|
||||
StandardError: req.StandardError,
|
||||
StandardInput: req.StandardInput,
|
||||
RequiredNodes: req.RequiredNodes,
|
||||
ExcludedNodes: req.ExcludedNodes,
|
||||
BeginTime: req.BeginTime,
|
||||
Deadline: req.Deadline,
|
||||
Array: req.Array,
|
||||
Dependency: req.Dependency,
|
||||
Requeue: req.Requeue,
|
||||
KillOnNodeFail: req.KillOnNodeFail,
|
||||
}
|
||||
|
||||
taskID, err := s.taskStore.Create(ctx, task)
|
||||
@@ -309,6 +336,17 @@ func (s *TaskService) ProcessTask(ctx context.Context, taskID int64) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Inject default scheduling parameters (in memory only; not persisted to the database)
|
||||
if task.TimeLimit == nil {
|
||||
task.TimeLimit = int32Ptr(10080) // 168 小时
|
||||
}
|
||||
if task.StandardOutput == nil {
|
||||
task.StandardOutput = strToPtrOrNil(filepath.Join(workDir, "slurm-%j.out"))
|
||||
}
|
||||
if task.StandardError == nil {
|
||||
task.StandardError = strToPtrOrNil(filepath.Join(workDir, "slurm-%j.err"))
|
||||
}
|
||||
|
||||
// 17. Render script
|
||||
rendered := RenderScript(app.ScriptTemplate, params, values)
|
||||
s.logger.Info("rendered script",
|
||||
@@ -319,8 +357,35 @@ func (s *TaskService) ProcessTask(ctx context.Context, taskID int64) error {
|
||||
|
||||
// 18. Submit to Slurm
|
||||
jobResp, err := s.jobSvc.SubmitJob(ctx, &model.SubmitJobRequest{
|
||||
Script: rendered,
|
||||
WorkDir: workDir,
|
||||
Script: rendered,
|
||||
WorkDir: workDir,
|
||||
Partition: task.Partition,
|
||||
CPUs: derefInt32(task.Cpus),
|
||||
TimeLimit: derefInt32ToStr(task.TimeLimit),
|
||||
QOS: derefStr(task.QOS),
|
||||
JobName: derefStr(task.JobName),
|
||||
MemoryPerNode: task.MemoryPerNode,
|
||||
MemoryPerCpu: task.MemoryPerCpu,
|
||||
Nodes: task.Nodes,
|
||||
Tasks: task.Tasks,
|
||||
CpusPerTask: task.CpusPerTask,
|
||||
Constraints: task.Constraints,
|
||||
Reservation: task.Reservation,
|
||||
Account: task.Account,
|
||||
Nice: task.Nice,
|
||||
MailType: task.MailType,
|
||||
MailUser: task.MailUser,
|
||||
StandardOutput: task.StandardOutput,
|
||||
StandardError: task.StandardError,
|
||||
StandardInput: task.StandardInput,
|
||||
RequiredNodes: task.RequiredNodes,
|
||||
ExcludedNodes: task.ExcludedNodes,
|
||||
BeginTime: task.BeginTime,
|
||||
Deadline: task.Deadline,
|
||||
Array: task.Array,
|
||||
Dependency: task.Dependency,
|
||||
Requeue: task.Requeue,
|
||||
KillOnNodeFail: task.KillOnNodeFail,
|
||||
})
|
||||
if err != nil {
|
||||
return fail(model.TaskStepSubmitting, fmt.Sprintf("submit job: %v", err))
|
||||
|
||||
Reference in New Issue
Block a user