Improve queue and logger context (#24924)

Previously there was a "graceful function", `RunWithShutdownFns`, which existed mainly for modules that don't support context. The old queue system didn't work well with context, so the old queues needed it.

After the queue refactoring, the new queue works well with context, so Golang context should be used as much as possible: `RunWithShutdownFns` can be removed (replaced by `RunWithCancel`, which uses the context cancel mechanism), and the related code can be simplified.

This PR also fixes some legacy queue-init problems, e.g.:

* typo: archiver: "unable to create codes indexer queue" => "unable to create repo-archive queue"
* no nil check for failed queues, which causes an unfriendly panic

After this PR, many goroutines have better display names:

![image](701b2a9b-8065-4137-aeaa-0bda2b34604a)
![image](f1d5f50f-0534-40f0-b0be-f2c9daa5fe92)
2023-05-26 15:31:55 +08:00 · 2023-05-26 15:31:55 +08:00 · 18f26cfbf7
commit 18f26cfbf7
parent e4922d484b
31 changed files with 204 additions and 263 deletions
--- a/modules/queue/manager.go
+++ b/modules/queue/manager.go
@ -88,22 +88,22 @@ func (m *Manager) FlushAll(ctx context.Context, timeout time.Duration) error {
 }

 // CreateSimpleQueue creates a simple queue from global setting config provider by name
-func CreateSimpleQueue[T any](name string, handler HandlerFuncT[T]) *WorkerPoolQueue[T] {
-	return createWorkerPoolQueue(name, setting.CfgProvider, handler, false)
+func CreateSimpleQueue[T any](ctx context.Context, name string, handler HandlerFuncT[T]) *WorkerPoolQueue[T] {
+	return createWorkerPoolQueue(ctx, name, setting.CfgProvider, handler, false)
 }

 // CreateUniqueQueue creates a unique queue from global setting config provider by name
-func CreateUniqueQueue[T any](name string, handler HandlerFuncT[T]) *WorkerPoolQueue[T] {
-	return createWorkerPoolQueue(name, setting.CfgProvider, handler, true)
+func CreateUniqueQueue[T any](ctx context.Context, name string, handler HandlerFuncT[T]) *WorkerPoolQueue[T] {
+	return createWorkerPoolQueue(ctx, name, setting.CfgProvider, handler, true)
 }

-func createWorkerPoolQueue[T any](name string, cfgProvider setting.ConfigProvider, handler HandlerFuncT[T], unique bool) *WorkerPoolQueue[T] {
+func createWorkerPoolQueue[T any](ctx context.Context, name string, cfgProvider setting.ConfigProvider, handler HandlerFuncT[T], unique bool) *WorkerPoolQueue[T] {
 	queueSetting, err := setting.GetQueueSettings(cfgProvider, name)
 	if err != nil {
 		log.Error("Failed to get queue settings for %q: %v", name, err)
 		return nil
 	}
-	w, err := NewWorkerPoolQueueBySetting(name, queueSetting, handler, unique)
+	w, err := NewWorkerPoolQueueWithContext(ctx, name, queueSetting, handler, unique)
 	if err != nil {
 		log.Error("Failed to create queue %q: %v", name, err)
 		return nil
--- a/modules/queue/manager_test.go
+++ b/modules/queue/manager_test.go
@ -29,7 +29,7 @@ func TestManager(t *testing.T) {
 		if err != nil {
 			return nil, err
 		}
-		return NewWorkerPoolQueueBySetting(name, qs, func(s ...int) (unhandled []int) { return nil }, false)
+		return newWorkerPoolQueueForTest(name, qs, func(s ...int) (unhandled []int) { return nil }, false)
 	}

 	// test invalid CONN_STR
@ -80,7 +80,7 @@ MAX_WORKERS = 2

 	assert.NoError(t, err)

-	q1 := createWorkerPoolQueue[string]("no-such", cfgProvider, nil, false)
+	q1 := createWorkerPoolQueue[string](context.Background(), "no-such", cfgProvider, nil, false)
 	assert.Equal(t, "no-such", q1.GetName())
 	assert.Equal(t, "dummy", q1.GetType()) // no handler, so it becomes dummy
 	assert.Equal(t, filepath.Join(setting.AppDataPath, "queues/dir1"), q1.baseConfig.DataFullDir)
@ -96,7 +96,7 @@ MAX_WORKERS = 2
 	assert.Equal(t, "string", q1.GetItemTypeName())
 	qid1 := GetManager().qidCounter

-	q2 := createWorkerPoolQueue("sub", cfgProvider, func(s ...int) (unhandled []int) { return nil }, false)
+	q2 := createWorkerPoolQueue(context.Background(), "sub", cfgProvider, func(s ...int) (unhandled []int) { return nil }, false)
 	assert.Equal(t, "sub", q2.GetName())
 	assert.Equal(t, "level", q2.GetType())
 	assert.Equal(t, filepath.Join(setting.AppDataPath, "queues/dir2"), q2.baseConfig.DataFullDir)
--- a/modules/queue/workergroup.go
+++ b/modules/queue/workergroup.go
@ -5,6 +5,7 @@ package queue

 import (
 	"context"
+	"runtime/pprof"
 	"sync"
 	"sync/atomic"
 	"time"
@ -13,9 +14,10 @@ import (
 )

 var (
-	infiniteTimerC        = make(chan time.Time)
-	batchDebounceDuration = 100 * time.Millisecond
-	workerIdleDuration    = 1 * time.Second
+	infiniteTimerC         = make(chan time.Time)
+	batchDebounceDuration  = 100 * time.Millisecond
+	workerIdleDuration     = 1 * time.Second
+	shutdownDefaultTimeout = 2 * time.Second

 	unhandledItemRequeueDuration atomic.Int64 // to avoid data race during test
 )
@ -116,13 +118,15 @@ func (q *WorkerPoolQueue[T]) doWorkerHandle(batch []T) {
 // If the queue is shutting down, it returns true and try to push the items
 // Otherwise it does nothing and returns false
 func (q *WorkerPoolQueue[T]) basePushForShutdown(items ...T) bool {
-	ctxShutdown := q.ctxShutdown.Load()
-	if ctxShutdown == nil {
+	shutdownTimeout := time.Duration(q.shutdownTimeout.Load())
+	if shutdownTimeout == 0 {
 		return false
 	}
+	ctxShutdown, ctxShutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout)
+	defer ctxShutdownCancel()
 	for _, item := range items {
 		// if there is still any error, the queue can do nothing instead of losing the items
-		if err := q.baseQueue.PushItem(*ctxShutdown, q.marshal(item)); err != nil {
+		if err := q.baseQueue.PushItem(ctxShutdown, q.marshal(item)); err != nil {
 			log.Error("Failed to requeue item for queue %q when shutting down: %v", q.GetName(), err)
 		}
 	}
@ -246,6 +250,8 @@ var skipFlushChan = make(chan flushType) // an empty flush chan, used to skip re

 // doRun is the main loop of the queue. All related "doXxx" functions are executed in its context.
 func (q *WorkerPoolQueue[T]) doRun() {
+	pprof.SetGoroutineLabels(q.ctxRun)
+
 	log.Debug("Queue %q starts running", q.GetName())
 	defer log.Debug("Queue %q stops running", q.GetName())

@ -271,8 +277,8 @@ func (q *WorkerPoolQueue[T]) doRun() {
 			}
 		}

-		ctxShutdownPtr := q.ctxShutdown.Load()
-		if ctxShutdownPtr != nil {
+		shutdownTimeout := time.Duration(q.shutdownTimeout.Load())
+		if shutdownTimeout != 0 {
 			// if there is a shutdown context, try to push the items back to the base queue
 			q.basePushForShutdown(unhandled...)
 			workerDone := make(chan struct{})
@ -280,7 +286,7 @@ func (q *WorkerPoolQueue[T]) doRun() {
 			go func() { wg.wg.Wait(); close(workerDone) }()
 			select {
 			case <-workerDone:
-			case <-(*ctxShutdownPtr).Done():
+			case <-time.After(shutdownTimeout):
 				log.Error("Queue %q is shutting down, but workers are still running after timeout", q.GetName())
 			}
 		} else {
--- a/modules/queue/workerqueue.go
+++ b/modules/queue/workerqueue.go
@ -10,9 +10,9 @@ import (
 	"sync/atomic"
 	"time"

-	"code.gitea.io/gitea/modules/graceful"
 	"code.gitea.io/gitea/modules/json"
 	"code.gitea.io/gitea/modules/log"
+	"code.gitea.io/gitea/modules/process"
 	"code.gitea.io/gitea/modules/setting"
 )

@ -21,8 +21,9 @@ import (
 type WorkerPoolQueue[T any] struct {
 	ctxRun       context.Context
 	ctxRunCancel context.CancelFunc
-	ctxShutdown  atomic.Pointer[context.Context]
-	shutdownDone chan struct{}
+
+	shutdownDone    chan struct{}
+	shutdownTimeout atomic.Int64 // in case some buggy handlers (workers) would hang forever, "shutdown" should finish in predictable time

 	origHandler HandlerFuncT[T]
 	safeHandler HandlerFuncT[T]
@ -175,22 +176,19 @@ func (q *WorkerPoolQueue[T]) Has(data T) (bool, error) {
 	return q.baseQueue.HasItem(q.ctxRun, q.marshal(data))
 }

-func (q *WorkerPoolQueue[T]) Run(atShutdown, atTerminate func(func())) {
-	atShutdown(func() {
-		// in case some queue handlers are slow or have hanging bugs, at most wait for a short time
-		q.ShutdownWait(1 * time.Second)
-	})
+func (q *WorkerPoolQueue[T]) Run() {
 	q.doRun()
 }

+func (q *WorkerPoolQueue[T]) Cancel() {
+	q.ctxRunCancel()
+}
+
 // ShutdownWait shuts down the queue, waits for all workers to finish their jobs, and pushes the unhandled items back to the base queue
 // It waits for all workers (handlers) to finish their jobs, in case some buggy handlers would hang forever, a reasonable timeout is needed
 func (q *WorkerPoolQueue[T]) ShutdownWait(timeout time.Duration) {
-	shutdownCtx, shutdownCtxCancel := context.WithTimeout(context.Background(), timeout)
-	defer shutdownCtxCancel()
-	if q.ctxShutdown.CompareAndSwap(nil, &shutdownCtx) {
-		q.ctxRunCancel()
-	}
+	q.shutdownTimeout.Store(int64(timeout))
+	q.ctxRunCancel()
 	<-q.shutdownDone
 }

@ -207,7 +205,11 @@ func getNewQueueFn(t string) (string, func(cfg *BaseConfig, unique bool) (baseQu
 	}
 }

-func NewWorkerPoolQueueBySetting[T any](name string, queueSetting setting.QueueSettings, handler HandlerFuncT[T], unique bool) (*WorkerPoolQueue[T], error) {
+func newWorkerPoolQueueForTest[T any](name string, queueSetting setting.QueueSettings, handler HandlerFuncT[T], unique bool) (*WorkerPoolQueue[T], error) {
+	return NewWorkerPoolQueueWithContext(context.Background(), name, queueSetting, handler, unique)
+}
+
+func NewWorkerPoolQueueWithContext[T any](ctx context.Context, name string, queueSetting setting.QueueSettings, handler HandlerFuncT[T], unique bool) (*WorkerPoolQueue[T], error) {
 	if handler == nil {
 		log.Debug("Use dummy queue for %q because handler is nil and caller doesn't want to process the queue items", name)
 		queueSetting.Type = "dummy"
@ -224,10 +226,11 @@ func NewWorkerPoolQueueBySetting[T any](name string, queueSetting setting.QueueS
 	}
 	log.Trace("Created queue %q of type %q", name, queueType)

-	w.ctxRun, w.ctxRunCancel = context.WithCancel(graceful.GetManager().ShutdownContext())
+	w.ctxRun, _, w.ctxRunCancel = process.GetManager().AddTypedContext(ctx, "Queue: "+w.GetName(), process.SystemProcessType, false)
 	w.batchChan = make(chan []T)
 	w.flushChan = make(chan flushType)
 	w.shutdownDone = make(chan struct{})
+	w.shutdownTimeout.Store(int64(shutdownDefaultTimeout))
 	w.workerMaxNum = queueSetting.MaxWorkers
 	w.batchLength = queueSetting.BatchLength

--- a/modules/queue/workerqueue_test.go
+++ b/modules/queue/workerqueue_test.go
@ -16,17 +16,9 @@ import (
 )

 func runWorkerPoolQueue[T any](q *WorkerPoolQueue[T]) func() {
-	var stop func()
-	started := make(chan struct{})
-	stopped := make(chan struct{})
-	go func() {
-		q.Run(func(f func()) { stop = f; close(started) }, nil)
-		close(stopped)
-	}()
-	<-started
+	go q.Run()
 	return func() {
-		stop()
-		<-stopped
+		q.ShutdownWait(1 * time.Second)
 	}
 }

@ -57,7 +49,7 @@ func TestWorkerPoolQueueUnhandled(t *testing.T) {
 			return unhandled
 		}

-		q, _ := NewWorkerPoolQueueBySetting("test-workpoolqueue", queueSetting, handler, false)
+		q, _ := newWorkerPoolQueueForTest("test-workpoolqueue", queueSetting, handler, false)
 		stop := runWorkerPoolQueue(q)
 		for i := 0; i < queueSetting.Length; i++ {
 			testRecorder.Record("push:%v", i)
@ -145,7 +137,7 @@ func testWorkerPoolQueuePersistence(t *testing.T, queueSetting setting.QueueSett
 			return nil
 		}

-		q, _ := NewWorkerPoolQueueBySetting("pr_patch_checker_test", queueSetting, testHandler, true)
+		q, _ := newWorkerPoolQueueForTest("pr_patch_checker_test", queueSetting, testHandler, true)
 		stop := runWorkerPoolQueue(q)
 		for i := 0; i < testCount; i++ {
 			_ = q.Push("task-" + strconv.Itoa(i))
@ -169,7 +161,7 @@ func testWorkerPoolQueuePersistence(t *testing.T, queueSetting setting.QueueSett
 			return nil
 		}

-		q, _ := NewWorkerPoolQueueBySetting("pr_patch_checker_test", queueSetting, testHandler, true)
+		q, _ := newWorkerPoolQueueForTest("pr_patch_checker_test", queueSetting, testHandler, true)
 		stop := runWorkerPoolQueue(q)
 		assert.NoError(t, q.FlushWithContext(context.Background(), 0))
 		stop()
@ -194,7 +186,7 @@ func TestWorkerPoolQueueActiveWorkers(t *testing.T) {
 		return nil
 	}

-	q, _ := NewWorkerPoolQueueBySetting("test-workpoolqueue", setting.QueueSettings{Type: "channel", BatchLength: 1, MaxWorkers: 1, Length: 100}, handler, false)
+	q, _ := newWorkerPoolQueueForTest("test-workpoolqueue", setting.QueueSettings{Type: "channel", BatchLength: 1, MaxWorkers: 1, Length: 100}, handler, false)
 	stop := runWorkerPoolQueue(q)
 	for i := 0; i < 5; i++ {
 		assert.NoError(t, q.Push(i))
@ -210,7 +202,7 @@ func TestWorkerPoolQueueActiveWorkers(t *testing.T) {
 	assert.EqualValues(t, 1, q.GetWorkerNumber()) // there is at least one worker after the queue begins working
 	stop()

-	q, _ = NewWorkerPoolQueueBySetting("test-workpoolqueue", setting.QueueSettings{Type: "channel", BatchLength: 1, MaxWorkers: 3, Length: 100}, handler, false)
+	q, _ = newWorkerPoolQueueForTest("test-workpoolqueue", setting.QueueSettings{Type: "channel", BatchLength: 1, MaxWorkers: 3, Length: 100}, handler, false)
 	stop = runWorkerPoolQueue(q)
 	for i := 0; i < 15; i++ {
 		assert.NoError(t, q.Push(i))
@ -238,23 +230,23 @@ func TestWorkerPoolQueueShutdown(t *testing.T) {
 		if items[0] == 0 {
 			close(handlerCalled)
 		}
-		time.Sleep(100 * time.Millisecond)
+		time.Sleep(400 * time.Millisecond)
 		return items
 	}

 	qs := setting.QueueSettings{Type: "level", Datadir: t.TempDir() + "/queue", BatchLength: 3, MaxWorkers: 4, Length: 20}
-	q, _ := NewWorkerPoolQueueBySetting("test-workpoolqueue", qs, handler, false)
+	q, _ := newWorkerPoolQueueForTest("test-workpoolqueue", qs, handler, false)
 	stop := runWorkerPoolQueue(q)
 	for i := 0; i < qs.Length; i++ {
 		assert.NoError(t, q.Push(i))
 	}
 	<-handlerCalled
-	time.Sleep(50 * time.Millisecond) // wait for a while to make sure all workers are active
+	time.Sleep(200 * time.Millisecond) // wait for a while to make sure all workers are active
 	assert.EqualValues(t, 4, q.GetWorkerActiveNumber())
 	stop() // stop triggers shutdown
 	assert.EqualValues(t, 0, q.GetWorkerActiveNumber())

 	// no item was ever handled, so we still get all of them again
-	q, _ = NewWorkerPoolQueueBySetting("test-workpoolqueue", qs, handler, false)
+	q, _ = newWorkerPoolQueueForTest("test-workpoolqueue", qs, handler, false)
 	assert.EqualValues(t, 20, q.GetQueueItemNumber())
 }