Search PoC: Improves initial indexing speed. Makes params configurable. (#95439)

* Improves initial indexing speed. Makes params configurable. * fix linter errors * removes kind param * updates index test * remove println from test * removes error check in test * adds log for high index latency and updates max goroutine var with workers config var * fix test timing out - set worker limit * set the batch size --------- Co-authored-by: Scott Lepper <scott.lepper@gmail.com>
2025-02-25 18:55:37 -06:00 · 2024-10-29 12:24:31 -06:00 · 2024-10-29 12:24:31 -06:00 · 995128d1db
commit 995128d1db
parent 189802d3c3
5 changed files with 137 additions and 54 deletions
--- a/pkg/setting/setting.go
+++ b/pkg/setting/setting.go
@ -532,8 +532,11 @@ type Cfg struct {
 	ShortLinkExpiration int

 	// Unified Storage
-	UnifiedStorage map[string]UnifiedStorageConfig
-	IndexPath      string
+	UnifiedStorage    map[string]UnifiedStorageConfig
+	IndexPath         string
+	IndexWorkers      int
+	IndexMaxBatchSize int
+	IndexListLimit    int
 }

 type UnifiedStorageConfig struct {
@ -1343,7 +1346,6 @@ func (cfg *Cfg) parseINIFile(iniFile *ini.File) error {

 	// unified storage config
 	cfg.setUnifiedStorageConfig()
-	cfg.setIndexPath()

 	return nil
 }
--- a/pkg/setting/setting_unified_storage.go
+++ b/pkg/setting/setting_unified_storage.go
@ -35,8 +35,11 @@ func (cfg *Cfg) setUnifiedStorageConfig() {
 		}
 	}
 	cfg.UnifiedStorage = storageConfig
-}

-func (cfg *Cfg) setIndexPath() {
-	cfg.IndexPath = cfg.Raw.Section("unified_storage").Key("index_path").String()
+	// Set indexer config for unified storage
+	section := cfg.Raw.Section("unified_storage")
+	cfg.IndexPath = section.Key("index_path").String()
+	cfg.IndexWorkers = section.Key("index_workers").MustInt(10)
+	cfg.IndexMaxBatchSize = section.Key("index_max_batch_size").MustInt(100)
+	cfg.IndexListLimit = section.Key("index_list_limit").MustInt(1000)
 }
--- a/pkg/storage/unified/resource/index.go
+++ b/pkg/storage/unified/resource/index.go
@ -13,10 +13,10 @@ import (
 	"github.com/grafana/grafana/pkg/infra/tracing"
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/trace"
+	"golang.org/x/sync/errgroup"
 )

 const tracingPrexfixIndex = "unified_storage.index."
-const pageSize = 10000

 type Shard struct {
 	index bleve.Index
@ -30,12 +30,11 @@ type Index struct {
 	s      *server
 	log    log.Logger
 	tracer tracing.Tracer
-	path   string
 }

-func NewIndex(s *server, opts Opts, path string, tracer tracing.Tracer) *Index {
-	if path == "" {
-		path = os.TempDir()
+func NewIndex(s *server, opts Opts, tracer tracing.Tracer) *Index {
+	if opts.IndexDir == "" {
+		opts.IndexDir = os.TempDir()
 	}

 	idx := &Index{
@ -44,46 +43,88 @@ func NewIndex(s *server, opts Opts, path string, tracer tracing.Tracer) *Index {
 		shards: make(map[string]Shard),
 		log:    log.New("unifiedstorage.search.index"),
 		tracer: tracer,
-		path:   path,
 	}

 	return idx
 }

-func (i *Index) IndexBatch(ctx context.Context, list *ListResponse) error {
-	ctx, span := i.tracer.Start(ctx, tracingPrexfixIndex+"CreateIndexBatches")
-	for _, obj := range list.Items {
-		indexableResource, err := NewIndexedResource(obj.Value)
-		if err != nil {
-			return err
-		}
-
-		shard, err := i.getShard(indexableResource.Namespace)
-		if err != nil {
-			return err
-		}
-		i.log.Debug("indexing resource in batch", "batch_count", len(list.Items), "tenant", indexableResource.Namespace)
-
-		err = shard.batch.Index(indexableResource.Uid, indexableResource)
-		if err != nil {
-			return err
-		}
-	}
-	span.End()
-
-	_, span = i.tracer.Start(ctx, tracingPrexfixIndex+"IndexBatches")
+// IndexBatches goes through all the shards and indexes their batches if they are large enough
+func (i *Index) IndexBatches(ctx context.Context, maxSize int, tenants []string) error {
+	_, span := i.tracer.Start(ctx, tracingPrexfixIndex+"IndexBatches")
 	defer span.End()
-	for _, shard := range i.shards {
-		err := shard.index.Batch(shard.batch)
+
+	group := errgroup.Group{}
+	group.SetLimit(i.opts.Workers)
+	totalBatchesIndexed := 0
+
+	for _, tenant := range tenants {
+		shard, err := i.getShard(tenant)
 		if err != nil {
 			return err
 		}
-		shard.batch.Reset()
+		// Index the batch if it is large enough
+		if shard.batch.Size() >= maxSize {
+			totalBatchesIndexed++
+			group.Go(func() error {
+				i.log.Debug("indexing batch for shard", "tenant", tenant, "size", shard.batch.Size())
+				err = shard.index.Batch(shard.batch)
+				if err != nil {
+					return err
+				}
+				shard.batch.Reset()
+				return nil
+			})
+		}
 	}

+	err := group.Wait()
+	if err != nil {
+		return err
+	}
+
+	span.AddEvent("batches indexed", trace.WithAttributes(attribute.Int("batches_indexed", totalBatchesIndexed)))
+
 	return nil
 }

+// AddToBatches adds resources to their respective shard's batch
+// returns a list of tenants that have changes
+func (i *Index) AddToBatches(ctx context.Context, list *ListResponse) ([]string, error) {
+	_, span := i.tracer.Start(ctx, tracingPrexfixIndex+"AddToBatches")
+	defer span.End()
+
+	tenantsWithChanges := make(map[string]bool)
+	for _, obj := range list.Items {
+		// Transform the raw resource into a more generic indexable resource
+		res, err := NewIndexedResource(obj.Value)
+		if err != nil {
+			return nil, err
+		}
+
+		shard, err := i.getShard(res.Namespace)
+		if err != nil {
+			return nil, err
+		}
+		i.log.Debug("indexing resource in batch", "batch_count", len(list.Items), "kind", res.Kind, "tenant", res.Namespace)
+
+		err = shard.batch.Index(res.Uid, res)
+		if err != nil {
+			return nil, err
+		}
+
+		if _, ok := tenantsWithChanges[res.Namespace]; !ok {
+			tenantsWithChanges[res.Namespace] = true
+		}
+	}
+
+	tenants := make([]string, 0, len(tenantsWithChanges))
+	for tenant, _ := range tenantsWithChanges {
+		tenants = append(tenants, tenant)
+	}
+
+	return tenants, nil
+}
+
 func (i *Index) Init(ctx context.Context) error {
 	ctx, span := i.tracer.Start(ctx, tracingPrexfixIndex+"Init")
 	defer span.End()
@ -92,12 +133,12 @@ func (i *Index) Init(ctx context.Context) error {
 	resourceTypes := fetchResourceTypes()
 	totalObjectsFetched := 0
 	for _, rt := range resourceTypes {
-		i.log.Info("indexing resource", "kind", rt.Key.Resource)
-		r := &ListRequest{Options: rt, Limit: pageSize}
+		i.log.Info("indexing resource", "kind", rt.Key.Resource, "list_limit", i.opts.ListLimit, "batch_size", i.opts.BatchSize, "workers", i.opts.Workers)
+		r := &ListRequest{Options: rt, Limit: int64(i.opts.ListLimit)}

 		// Paginate through the list of resources and index each page
 		for {
-			i.log.Debug("fetching resource list", "kind", rt.Key.Resource)
+			i.log.Info("fetching resource list", "kind", rt.Key.Resource)
 			list, err := i.s.List(ctx, r)
 			if err != nil {
 				return err
@ -105,8 +146,15 @@ func (i *Index) Init(ctx context.Context) error {

 			totalObjectsFetched += len(list.Items)

-			// Index current page
-			err = i.IndexBatch(ctx, list)
+			i.log.Info("indexing batch", "kind", rt.Key.Resource, "count", len(list.Items))
+			//add changes to batches for shards with changes in the List
+			tenants, err := i.AddToBatches(ctx, list)
+			if err != nil {
+				return err
+			}
+
+			// Index the batches for tenants with changes if the batch is large enough
+			err = i.IndexBatches(ctx, i.opts.BatchSize, tenants)
 			if err != nil {
 				return err
 			}
@ -118,6 +166,14 @@ func (i *Index) Init(ctx context.Context) error {
 			r.NextPageToken = list.NextPageToken
 		}
 	}
+
+	//index all remaining batches
+	i.log.Info("indexing remaining batches", "shards", len(i.shards))
+	err := i.IndexBatches(ctx, 1, i.allTenants())
+	if err != nil {
+		return err
+	}
+
 	span.AddEvent("indexing finished", trace.WithAttributes(attribute.Int64("objects_indexed", int64(totalObjectsFetched))))
 	end := time.Now().Unix()
 	i.log.Info("Initial indexing finished", "seconds", float64(end-start))
@ -150,6 +206,9 @@ func (i *Index) Index(ctx context.Context, data *Data) error {

 	// record latency from when event was created to when it was indexed
 	latencySeconds := float64(time.Now().UnixMicro()-data.Value.ResourceVersion) / 1e6
+	if latencySeconds > 5 {
+		i.log.Warn("high index latency", "latency", latencySeconds)
+	}
 	if IndexServerMetrics != nil {
 		IndexServerMetrics.IndexLatency.WithLabelValues(data.Key.Resource).Observe(latencySeconds)
 	}
@ -234,9 +293,10 @@ func (i *Index) Count() (uint64, error) {
 }

 type Opts struct {
-	Workers    int // This controls how many goroutines are used to index objects
-	BatchSize  int // This is the batch size for how many objects to add to the index at once
-	Concurrent bool
+	Workers   int    // This controls how many goroutines are used to index objects
+	BatchSize int    // This is the batch size for how many objects to add to the index at once
+	ListLimit int    // This is how big the List page size is. If the response size is too large, the number of items will be limited by the server.
+	IndexDir  string // The directory where the indexes for each tenant are stored
 }

 func createFileIndex(path string) (bleve.Index, string, error) {
@ -248,12 +308,20 @@ func createFileIndex(path string) (bleve.Index, string, error) {
 	return index, indexPath, err
 }

+func (i *Index) allTenants() []string {
+	tenants := make([]string, 0, len(i.shards))
+	for tenant := range i.shards {
+		tenants = append(tenants, tenant)
+	}
+	return tenants
+}
+
 func (i *Index) getShard(tenant string) (Shard, error) {
 	shard, ok := i.shards[tenant]
 	if ok {
 		return shard, nil
 	}
-	index, path, err := createFileIndex(i.path)
+	index, path, err := createFileIndex(i.opts.IndexDir)
 	if err != nil {
 		return Shard{}, err
 	}
--- a/pkg/storage/unified/resource/index_server.go
+++ b/pkg/storage/unified/resource/index_server.go
@ -56,7 +56,13 @@ func (is *IndexServer) Load(ctx context.Context) error {
 	ctx, span := is.tracer.Start(ctx, tracingPrefixIndexServer+"Load")
 	defer span.End()

-	is.index = NewIndex(is.s, Opts{}, is.cfg.IndexPath, is.tracer)
+	opts := Opts{
+		Workers:   is.cfg.IndexWorkers,
+		BatchSize: is.cfg.IndexMaxBatchSize,
+		ListLimit: is.cfg.IndexListLimit,
+		IndexDir:  is.cfg.IndexPath,
+	}
+	is.index = NewIndex(is.s, opts, is.tracer)
 	err := is.index.Init(ctx)
 	if err != nil {
 		return err
--- a/pkg/storage/unified/resource/index_test.go
+++ b/pkg/storage/unified/resource/index_test.go
@ -20,24 +20,27 @@ func TestIndexBatch(t *testing.T) {
 	trace, err := tracing.ProvideService(tracingCfg)
 	if err != nil {
 		t.Fatal(err)
-		return
 	}

 	tmpdir := os.TempDir() + "testindexbatch"

 	defer func() {
-		err := os.RemoveAll(tmpdir)
+		err = os.RemoveAll(tmpdir)
 		if err != nil {
 			t.Fatal(err)
-			return
 		}
 	}()

 	index := &Index{
 		tracer: trace,
 		shards: make(map[string]Shard),
-		path:   tmpdir,
 		log:    log.New("unifiedstorage.search.index"),
+		opts: Opts{
+			IndexDir:  tmpdir,
+			ListLimit: 10000,
+			Workers:   10,
+			BatchSize: 10000,
+		},
 	}

 	ctx := context.Background()
@ -47,15 +50,17 @@ func TestIndexBatch(t *testing.T) {
 	for i := 0; i < 10; i++ {
 		list := &ListResponse{Items: loadTestItems(strconv.Itoa(i))}
 		start := time.Now()
-		err = index.IndexBatch(ctx, list)
+		_, err = index.AddToBatches(ctx, list)
 		if err != nil {
 			t.Fatal(err)
-			return
 		}
 		elapsed := time.Since(start)
 		fmt.Println("Time elapsed:", elapsed)
 	}

+	// index all batches for each shard/tenant
+	err = index.IndexBatches(ctx, 1, namespaces)
+
 	elapsed := time.Since(startAll)
 	fmt.Println("Total Time elapsed:", elapsed)

@ -64,7 +69,6 @@ func TestIndexBatch(t *testing.T) {
 	total, err := index.Count()
 	if err != nil {
 		t.Fatal(err)
-		return
 	}

 	assert.Equal(t, uint64(100000), total)