fix(task): prevent RecoverStuckTasks from re-enqueueing in-flight tasks
RecoverStuckTasks scans for tasks whose updated_at is more than 5 minutes old and re-enqueues them. This incorrectly matched tasks that were still actively being processed by the worker (e.g. slow downloads), causing double-processing. Add an inflight sync.Map to track the task IDs currently inside ProcessTask; RecoverStuckTasks now skips any task found in inflight. Because inflight is in-memory only, it is empty after a server restart, so genuinely stuck tasks are still recovered correctly. Also: increase the taskCh buffer from 16 to 10000, add a periodic RecoverStuckTasks goroutine to TaskPoller (every 5 minutes), and add a status guard in ProcessTask as defense-in-depth against duplicate enqueues.
This commit is contained in:
@@ -8,12 +8,15 @@ import (
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// TaskPollable defines the interface for refreshing stale task statuses.
|
||||
// TaskPollable defines the interface for refreshing stale task statuses
|
||||
// and recovering stuck tasks.
|
||||
type TaskPollable interface {
|
||||
// RefreshStaleTasks refreshes the statuses of stale tasks and reports
// any failure to the caller as an error.
RefreshStaleTasks(ctx context.Context) error
|
||||
// RecoverStuckTasks recovers stuck tasks. It returns nothing, so the
// caller cannot observe failures.
// NOTE(review): per the commit description, the implementation re-enqueues
// tasks whose updated_at is older than 5 minutes and skips tasks that are
// currently in flight — confirm against the implementing service.
RecoverStuckTasks(ctx context.Context)
|
||||
}
|
||||
|
||||
// TaskPoller periodically polls Slurm for task status updates via TaskPollable.
|
||||
// TaskPoller periodically polls Slurm for task status updates and recovers
|
||||
// stuck tasks via TaskPollable.
|
||||
type TaskPoller struct {
|
||||
taskSvc TaskPollable
|
||||
interval time.Duration
|
||||
@@ -31,9 +34,11 @@ func NewTaskPoller(taskSvc TaskPollable, interval time.Duration, logger *zap.Log
|
||||
}
|
||||
}
|
||||
|
||||
// Start launches the background goroutine that periodically refreshes stale tasks.
|
||||
// Start launches background goroutines that periodically refresh stale tasks
|
||||
// and recover stuck tasks.
|
||||
func (p *TaskPoller) Start(ctx context.Context) {
|
||||
ctx, p.cancel = context.WithCancel(ctx)
|
||||
|
||||
p.wg.Add(1)
|
||||
go func() {
|
||||
defer p.wg.Done()
|
||||
@@ -50,9 +55,24 @@ func (p *TaskPoller) Start(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
p.wg.Add(1)
|
||||
go func() {
|
||||
defer p.wg.Done()
|
||||
ticker := time.NewTicker(5 * time.Minute)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
p.taskSvc.RecoverStuckTasks(ctx)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Stop cancels the background goroutine and waits for it to finish.
|
||||
// Stop cancels the background goroutines and waits for them to finish.
|
||||
func (p *TaskPoller) Stop() {
|
||||
if p.cancel != nil {
|
||||
p.cancel()
|
||||
|
||||
@@ -25,6 +25,8 @@ func (m *mockTaskPollable) RefreshStaleTasks(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// RecoverStuckTasks is a no-op; it exists so that mockTaskPollable provides
// the RecoverStuckTasks method required by the TaskPollable interface.
func (m *mockTaskPollable) RecoverStuckTasks(ctx context.Context) {}
|
||||
|
||||
func (m *mockTaskPollable) getCallCount() int {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
Reference in New Issue
Block a user