RecoverStuckTasks scans for tasks whose updated_at is more than 5 minutes old and re-enqueues them. This incorrectly matched tasks that the worker was still actively processing (e.g. slow downloads), causing double-processing. Add an inflight sync.Map to track the taskIDs currently inside ProcessTask; RecoverStuckTasks skips any task found in inflight. On server restart inflight is empty (it is in-memory only), so genuinely stuck tasks are still correctly recovered. Also: increase the taskCh buffer from 16 to 10000, add a periodic RecoverStuckTasks goroutine to TaskPoller (every 5 minutes), and add a status guard in ProcessTask as defense-in-depth against duplicate enqueues.
82 lines
1.7 KiB
Go
82 lines
1.7 KiB
Go
package app
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"time"
|
|
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// TaskPollable is the task-service surface that TaskPoller drives from its
// background loops: refreshing stale task statuses and recovering stuck tasks.
type TaskPollable interface {
	// RefreshStaleTasks refreshes the status of stale tasks. TaskPoller logs a
	// non-nil error and simply calls the method again on its next tick.
	RefreshStaleTasks(ctx context.Context) error

	// RecoverStuckTasks recovers stuck tasks. It returns no error, so any
	// failure reporting is up to the implementation.
	RecoverStuckTasks(ctx context.Context)
}
|
|
|
|
// TaskPoller periodically polls Slurm for task status updates and recovers
|
|
// stuck tasks via TaskPollable.
|
|
type TaskPoller struct {
|
|
taskSvc TaskPollable
|
|
interval time.Duration
|
|
cancel context.CancelFunc
|
|
wg sync.WaitGroup
|
|
logger *zap.Logger
|
|
}
|
|
|
|
// NewTaskPoller creates a new TaskPoller with the given service, interval, and logger.
|
|
func NewTaskPoller(taskSvc TaskPollable, interval time.Duration, logger *zap.Logger) *TaskPoller {
|
|
return &TaskPoller{
|
|
taskSvc: taskSvc,
|
|
interval: interval,
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
// Start launches background goroutines that periodically refresh stale tasks
|
|
// and recover stuck tasks.
|
|
func (p *TaskPoller) Start(ctx context.Context) {
|
|
ctx, p.cancel = context.WithCancel(ctx)
|
|
|
|
p.wg.Add(1)
|
|
go func() {
|
|
defer p.wg.Done()
|
|
ticker := time.NewTicker(p.interval)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
if err := p.taskSvc.RefreshStaleTasks(ctx); err != nil {
|
|
p.logger.Error("failed to refresh stale tasks", zap.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
|
|
p.wg.Add(1)
|
|
go func() {
|
|
defer p.wg.Done()
|
|
ticker := time.NewTicker(5 * time.Minute)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
p.taskSvc.RecoverStuckTasks(ctx)
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// Stop cancels the background goroutines and waits for them to finish.
|
|
func (p *TaskPoller) Stop() {
|
|
if p.cancel != nil {
|
|
p.cancel()
|
|
}
|
|
p.wg.Wait()
|
|
}
|