module-elasticsearch: include custom document id to batch upload (#30128)
* include custom document id to elasticsearch batch upload --------- Signed-off-by: Matheus Almeida <matheusjv14@gmail.com>
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
---
|
||||
'@backstage/plugin-search-backend-module-elasticsearch': patch
|
||||
---
|
||||
|
||||
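Added support for the optional `search.elasticsearch.batchKeyField` configuration option in the Elasticsearch indexer, which uses the named document field as the document `_id` to allow consistent document IDs during bulk uploads.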
|
||||
@@ -249,6 +249,8 @@ lockdown
|
||||
lockfile
|
||||
lockfiles
|
||||
lookbehind
|
||||
lookup
|
||||
lookups
|
||||
lowercased
|
||||
lunr
|
||||
Luxon
|
||||
|
||||
@@ -239,6 +239,26 @@ search:
|
||||
|
||||
> You can also increase the batch size if you are using a large ES instance.
|
||||
|
||||
### Elasticsearch batch key field
|
||||
|
||||
By default, the Elasticsearch indexer lets Elasticsearch assign an auto-generated `_id` to each document during bulk uploads. If you set the optional `batchKeyField`, the value of that field in each document is used as its `_id` instead; documents in which the field is missing or empty still receive an auto-generated `_id`. Most users won't need to customize this. However, if your use case involves frequent lookups or updates to existing documents, setting `batchKeyField` can be beneficial: it allows you to define a consistent identifier for each document, helping to streamline updates and prevent duplicate entries. Be aware that if the value provided for `batchKeyField` is not unique across documents, Elasticsearch will overwrite any existing document with the same `_id`.
|
||||
|
||||
**Using `batchKeyField` (Custom `_id`)**
|
||||
|
||||
```yaml
|
||||
search:
|
||||
elasticsearch:
|
||||
batchKeyField: document_id
|
||||
```
|
||||
|
||||
**Default Behavior (Auto-generated `_id`)**
|
||||
|
||||
```yaml
|
||||
search:
|
||||
elasticsearch:
|
||||
# No batchKeyField specified — Elasticsearch will autogenerate _id
|
||||
```
|
||||
|
||||
### Elasticsearch Index Name Customization
|
||||
|
||||
By default, the Elasticsearch indexer creates index names based on their type, a separator, and the current date as a postfix. You can configure a custom prefix for all indices by adding the following section to your app configuration.
|
||||
|
||||
@@ -29,6 +29,11 @@ export interface Config {
|
||||
* Batch size for elastic search indexing tasks. Defaults to 1000.
|
||||
*/
|
||||
batchSize?: number;
|
||||
/**
|
||||
* Defines the name of the field in each document that will be used to identify documents during a batch upload.
|
||||
* If not provided, Elasticsearch will auto-generate an ID for each document.
|
||||
*/
|
||||
batchKeyField?: string;
|
||||
/**
|
||||
* Options for configuring highlight settings
|
||||
* See https://www.elastic.co/guide/en/elasticsearch/reference/7.17/highlighting.html
|
||||
|
||||
@@ -137,7 +137,7 @@ export class ElasticSearchClientWrapper {
|
||||
// (undocumented)
|
||||
bulk(bulkOptions: {
|
||||
datasource: Readable;
|
||||
onDocument: () => ElasticSearchIndexAction;
|
||||
onDocument: (doc: any) => ElasticSearchIndexAction;
|
||||
refreshOnCompletion?: string | boolean;
|
||||
}): BulkHelper<BulkStats>;
|
||||
// (undocumented)
|
||||
@@ -348,6 +348,7 @@ export class ElasticSearchSearchEngine implements SearchEngine {
|
||||
indexPrefix: string,
|
||||
logger: LoggerService,
|
||||
batchSize: number,
|
||||
batchKeyField?: string | undefined,
|
||||
highlightOptions?: ElasticSearchHighlightOptions,
|
||||
queryOptions?: ElasticSearchQueryConfig,
|
||||
);
|
||||
@@ -393,6 +394,7 @@ export type ElasticSearchSearchEngineIndexerOptions = {
|
||||
logger: LoggerService;
|
||||
elasticSearchClientWrapper: ElasticSearchClientWrapper;
|
||||
batchSize: number;
|
||||
batchKeyField?: string;
|
||||
skipRefresh?: boolean;
|
||||
};
|
||||
|
||||
|
||||
+1
-1
@@ -115,7 +115,7 @@ export class ElasticSearchClientWrapper {
|
||||
|
||||
bulk(bulkOptions: {
|
||||
datasource: Readable;
|
||||
onDocument: () => ElasticSearchIndexAction;
|
||||
onDocument: (doc: any) => ElasticSearchIndexAction;
|
||||
refreshOnCompletion?: string | boolean;
|
||||
}) {
|
||||
if (this.openSearchClient) {
|
||||
|
||||
@@ -142,6 +142,7 @@ export class ElasticSearchSearchEngine implements SearchEngine {
|
||||
private readonly indexPrefix: string,
|
||||
private readonly logger: LoggerService,
|
||||
private readonly batchSize: number,
|
||||
private readonly batchKeyField?: string,
|
||||
highlightOptions?: ElasticSearchHighlightOptions,
|
||||
queryOptions?: ElasticSearchQueryConfig,
|
||||
) {
|
||||
@@ -189,6 +190,7 @@ export class ElasticSearchSearchEngine implements SearchEngine {
|
||||
logger,
|
||||
config.getOptionalNumber('search.elasticsearch.batchSize') ??
|
||||
DEFAULT_INDEXER_BATCH_SIZE,
|
||||
config.getOptionalString('search.elasticsearch.batchKeyField'),
|
||||
config.getOptional<ElasticSearchHighlightOptions>(
|
||||
'search.elasticsearch.highlightOptions',
|
||||
),
|
||||
@@ -345,6 +347,7 @@ export class ElasticSearchSearchEngine implements SearchEngine {
|
||||
elasticSearchClientWrapper: this.elasticSearchClientWrapper,
|
||||
logger: indexerLogger,
|
||||
batchSize: this.batchSize,
|
||||
batchKeyField: this.batchKeyField,
|
||||
skipRefresh:
|
||||
(
|
||||
this
|
||||
|
||||
+76
@@ -351,4 +351,80 @@ describe('ElasticSearchSearchEngineIndexer', () => {
|
||||
expect(bulkSpy).toHaveBeenCalledTimes(1);
|
||||
expect(refreshSpy).toHaveBeenCalledTimes(0);
|
||||
});
|
||||
|
||||
it('indexes documents with custom batch key field', async () => {
|
||||
indexer = new ElasticSearchSearchEngineIndexer({
|
||||
type: 'some-type',
|
||||
indexPrefix: '',
|
||||
indexSeparator: '-index__',
|
||||
alias: 'some-type-index__search',
|
||||
logger: mockServices.logger.mock(),
|
||||
elasticSearchClientWrapper: clientWrapper,
|
||||
batchSize: 1000,
|
||||
skipRefresh: false,
|
||||
batchKeyField: 'customId',
|
||||
});
|
||||
|
||||
const documents = [
|
||||
{
|
||||
title: 'testTerm',
|
||||
text: 'testText',
|
||||
location: 'test/location',
|
||||
customId: '123',
|
||||
},
|
||||
{
|
||||
title: 'Another test',
|
||||
text: 'Some more text',
|
||||
location: 'test/location/2',
|
||||
customId: '456',
|
||||
},
|
||||
];
|
||||
|
||||
await TestPipeline.fromIndexer(indexer).withDocuments(documents).execute();
|
||||
|
||||
expect(bulkSpy).toHaveBeenCalled();
|
||||
|
||||
const bulkBody = bulkSpy.mock.calls[0][0].body;
|
||||
|
||||
expect(bulkBody[0]).toEqual(
|
||||
expect.objectContaining({
|
||||
_id: '123',
|
||||
index: expect.objectContaining({
|
||||
_index: expect.stringContaining('some-type-index__'),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(bulkBody[2]).toEqual(
|
||||
expect.objectContaining({
|
||||
_id: '456',
|
||||
index: expect.objectContaining({
|
||||
_index: expect.stringContaining('some-type-index__'),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
}, 40000);
|
||||
|
||||
it('indexes documents without custom batch key field when not specified', async () => {
|
||||
const documents = [
|
||||
{
|
||||
title: 'testTerm',
|
||||
text: 'testText',
|
||||
location: 'test/location',
|
||||
customId: '123',
|
||||
},
|
||||
];
|
||||
|
||||
await TestPipeline.fromIndexer(indexer).withDocuments(documents).execute();
|
||||
|
||||
const bulkBody = bulkSpy.mock.calls[0][0].body;
|
||||
|
||||
expect(bulkBody[0]).not.toHaveProperty('_id');
|
||||
expect(bulkBody[0]).toEqual(
|
||||
expect.objectContaining({
|
||||
index: expect.objectContaining({
|
||||
_index: expect.stringContaining('some-type-index__'),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
+5
-1
@@ -32,6 +32,7 @@ export type ElasticSearchSearchEngineIndexerOptions = {
|
||||
logger: LoggerService;
|
||||
elasticSearchClientWrapper: ElasticSearchClientWrapper;
|
||||
batchSize: number;
|
||||
batchKeyField?: string;
|
||||
skipRefresh?: boolean;
|
||||
};
|
||||
|
||||
@@ -87,10 +88,13 @@ export class ElasticSearchSearchEngineIndexer extends BatchSearchEngineIndexer {
|
||||
// documents have been successfully written to ES.
|
||||
this.bulkResult = this.elasticSearchClientWrapper.bulk({
|
||||
datasource: this.sourceStream,
|
||||
onDocument() {
|
||||
onDocument(doc) {
|
||||
that.processed++;
|
||||
return {
|
||||
index: { _index: that.indexName },
|
||||
...(options.batchKeyField && doc[options.batchKeyField]
|
||||
? { _id: doc[options.batchKeyField] }
|
||||
: {}),
|
||||
};
|
||||
},
|
||||
refreshOnCompletion: options.skipRefresh !== true,
|
||||
|
||||
Reference in New Issue
Block a user