module-elasticsearch: include custom document id to batch upload (#30128)

* include custom document id to elasticsearch batch upload

---------

Signed-off-by: Matheus Almeida <matheusjv14@gmail.com>
This commit is contained in:
Matheus Almeida
2025-09-15 12:16:24 -03:00
committed by GitHub
parent d18818be84
commit cde70cabfd
9 changed files with 120 additions and 3 deletions
+5
View File
@@ -0,0 +1,5 @@
---
'@backstage/plugin-search-backend-module-elasticsearch': patch
---
Added support for `batchKeyField` in the Elasticsearch indexer to allow consistent document IDs during bulk uploads.
@@ -249,6 +249,8 @@ lockdown
lockfile
lockfiles
lookbehind
lookup
lookups
lowercased
lunr
Luxon
+20
View File
@@ -239,6 +239,26 @@ search:
> You can also increase the batch size if you are using a large ES instance.
### Elasticsearch batch key field
By default, during bulk uploads with the Elasticsearch indexer, each document is assigned an auto-generated `_id` unless a `batchKeyField` is explicitly set. This configuration is optional and most users won't need to customize it. However, if your use case involves frequent lookups or updates to existing documents, setting `batchKeyField` can be beneficial. It allows you to define a consistent identifier for each document, helping to streamline updates and prevent duplicate entries. Documents in which the configured field is missing or has an empty value still receive an auto-generated `_id`. Be aware that if the value provided for `batchKeyField` is not unique across documents, Elasticsearch will overwrite any existing document with the same `_id`.
**Using `batchKeyField` (Custom `_id`)**
```yaml
search:
elasticsearch:
batchKeyField: document_id
```
**Default Behavior (Auto-generated `_id`)**
```yaml
search:
elasticsearch:
# No batchKeyField specified — Elasticsearch will autogenerate _id
```
### Elasticsearch Index Name Customization
By default, the Elasticsearch indexer creates index names based on their type, a separator, and the current date as a postfix. You can configure a custom prefix for all indices by adding the following section to your app configuration.
@@ -29,6 +29,11 @@ export interface Config {
* Batch size for elastic search indexing tasks. Defaults to 1000.
*/
batchSize?: number;
/**
* Defines the name of the field in each document that will be used to identify documents during a batch upload.
* If not provided, Elasticsearch auto-generates an ID for each document.
*/
batchKeyField?: string;
/**
* Options for configuring highlight settings
* See https://www.elastic.co/guide/en/elasticsearch/reference/7.17/highlighting.html
@@ -137,7 +137,7 @@ export class ElasticSearchClientWrapper {
// (undocumented)
bulk(bulkOptions: {
datasource: Readable;
onDocument: () => ElasticSearchIndexAction;
onDocument: (doc: any) => ElasticSearchIndexAction;
refreshOnCompletion?: string | boolean;
}): BulkHelper<BulkStats>;
// (undocumented)
@@ -348,6 +348,7 @@ export class ElasticSearchSearchEngine implements SearchEngine {
indexPrefix: string,
logger: LoggerService,
batchSize: number,
batchKeyField?: string | undefined,
highlightOptions?: ElasticSearchHighlightOptions,
queryOptions?: ElasticSearchQueryConfig,
);
@@ -393,6 +394,7 @@ export type ElasticSearchSearchEngineIndexerOptions = {
logger: LoggerService;
elasticSearchClientWrapper: ElasticSearchClientWrapper;
batchSize: number;
batchKeyField?: string;
skipRefresh?: boolean;
};
@@ -115,7 +115,7 @@ export class ElasticSearchClientWrapper {
bulk(bulkOptions: {
datasource: Readable;
onDocument: () => ElasticSearchIndexAction;
onDocument: (doc: any) => ElasticSearchIndexAction;
refreshOnCompletion?: string | boolean;
}) {
if (this.openSearchClient) {
@@ -142,6 +142,7 @@ export class ElasticSearchSearchEngine implements SearchEngine {
private readonly indexPrefix: string,
private readonly logger: LoggerService,
private readonly batchSize: number,
private readonly batchKeyField?: string,
highlightOptions?: ElasticSearchHighlightOptions,
queryOptions?: ElasticSearchQueryConfig,
) {
@@ -189,6 +190,7 @@ export class ElasticSearchSearchEngine implements SearchEngine {
logger,
config.getOptionalNumber('search.elasticsearch.batchSize') ??
DEFAULT_INDEXER_BATCH_SIZE,
config.getOptionalString('search.elasticsearch.batchKeyField'),
config.getOptional<ElasticSearchHighlightOptions>(
'search.elasticsearch.highlightOptions',
),
@@ -345,6 +347,7 @@ export class ElasticSearchSearchEngine implements SearchEngine {
elasticSearchClientWrapper: this.elasticSearchClientWrapper,
logger: indexerLogger,
batchSize: this.batchSize,
batchKeyField: this.batchKeyField,
skipRefresh:
(
this
@@ -351,4 +351,80 @@ describe('ElasticSearchSearchEngineIndexer', () => {
expect(bulkSpy).toHaveBeenCalledTimes(1);
expect(refreshSpy).toHaveBeenCalledTimes(0);
});
// Verifies that an indexer configured with `batchKeyField: 'customId'` emits bulk
// actions whose `_id` is taken from each document's `customId` value.
it('indexes documents with custom batch key field', async () => {
// Replace the `indexer` from the enclosing scope with one that sets `batchKeyField`.
indexer = new ElasticSearchSearchEngineIndexer({
type: 'some-type',
indexPrefix: '',
indexSeparator: '-index__',
alias: 'some-type-index__search',
logger: mockServices.logger.mock(),
elasticSearchClientWrapper: clientWrapper,
batchSize: 1000,
skipRefresh: false,
batchKeyField: 'customId',
});
const documents = [
{
title: 'testTerm',
text: 'testText',
location: 'test/location',
customId: '123',
},
{
title: 'Another test',
text: 'Some more text',
location: 'test/location/2',
customId: '456',
},
];
await TestPipeline.fromIndexer(indexer).withDocuments(documents).execute();
expect(bulkSpy).toHaveBeenCalled();
// Body of the first recorded bulk call. Presumably action entries and document
// entries alternate, so indices 0 and 2 are the actions for the two documents.
const bulkBody = bulkSpy.mock.calls[0][0].body;
// NOTE(review): these assertions expect `_id` as a sibling of `index`. The
// Elasticsearch bulk API documents `_id` as a field inside the `index` action
// metadata (next to `_index`) — confirm the intended shape.
expect(bulkBody[0]).toEqual(
expect.objectContaining({
_id: '123',
index: expect.objectContaining({
_index: expect.stringContaining('some-type-index__'),
}),
}),
);
expect(bulkBody[2]).toEqual(
expect.objectContaining({
_id: '456',
index: expect.objectContaining({
_index: expect.stringContaining('some-type-index__'),
}),
}),
);
}, 40000);
// Checks that the first bulk action carries no `_id` when the `indexer` from the
// enclosing scope is used as-is (presumably created without `batchKeyField`),
// even though the document itself has a `customId` property.
it('indexes documents without custom batch key field when not specified', async () => {
  const docWithUnusedId = {
    title: 'testTerm',
    text: 'testText',
    location: 'test/location',
    customId: '123',
  };

  const pipeline = TestPipeline.fromIndexer(indexer).withDocuments([
    docWithUnusedId,
  ]);
  await pipeline.execute();

  // First entry of the body passed to the first recorded bulk call.
  const firstAction = bulkSpy.mock.calls[0][0].body[0];
  const expectedIndexAction = expect.objectContaining({
    index: expect.objectContaining({
      _index: expect.stringContaining('some-type-index__'),
    }),
  });

  expect(firstAction).not.toHaveProperty('_id');
  expect(firstAction).toEqual(expectedIndexAction);
});
});
@@ -32,6 +32,7 @@ export type ElasticSearchSearchEngineIndexerOptions = {
logger: LoggerService;
elasticSearchClientWrapper: ElasticSearchClientWrapper;
batchSize: number;
batchKeyField?: string;
skipRefresh?: boolean;
};
@@ -87,10 +88,13 @@ export class ElasticSearchSearchEngineIndexer extends BatchSearchEngineIndexer {
// documents have been successfully written to ES.
this.bulkResult = this.elasticSearchClientWrapper.bulk({
datasource: this.sourceStream,
onDocument() {
/**
 * Builds the bulk action for one streamed document and counts it as processed.
 *
 * @param doc - The document being indexed.
 * @returns An `index` action targeting `that.indexName`. It includes `_id` only
 *   when `options.batchKeyField` is set and the document has a truthy value for
 *   that field.
 */
onDocument(doc) {
  that.processed++;
  // Without a configured key field, or without a usable value on this
  // document, `_id` is omitted from the action.
  const customId = options.batchKeyField
    ? doc[options.batchKeyField]
    : undefined;
  return {
    // The bulk API reads `_id` from the action metadata object, alongside
    // `_index`; a top-level `_id` next to `index` is not part of the action.
    index: {
      _index: that.indexName,
      ...(customId ? { _id: customId } : {}),
    },
  };
},
refreshOnCompletion: options.skipRefresh !== true,