From cd914b79254f783f25bab020e0b40303f6f6f03f Mon Sep 17 00:00:00 2001 From: fhuaulme Date: Fri, 3 Apr 2026 10:56:36 +0200 Subject: [PATCH 1/6] SOLR-18189: Improve tracking of duplicate Solr docs with content hash https://issues.apache.org/jira/browse/SOLR-18189 --- .../ContentHashVersionProcessor.java | 196 +++++++++ .../ContentHashVersionProcessorFactory.java | 144 +++++++ ...ontentHashVersionProcessorFactoryTest.java | 147 +++++++ .../ContentHashVersionProcessorTest.java | 399 ++++++++++++++++++ 4 files changed, 886 insertions(+) create mode 100644 solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java create mode 100644 solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java create mode 100644 solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java create mode 100644 solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java new file mode 100644 index 00000000000..5127a6453f2 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -0,0 +1,196 @@ +package org.apache.solr.update.processor; + +import org.apache.lucene.util.BytesRef; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.component.RealTimeGetComponent; +import org.apache.solr.handler.component.RealTimeGetComponent.Resolution; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.TextField; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.UpdateCommand; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.Base64; +import java.util.Collection; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.function.Predicate; + +/** + * An implementation of {@link UpdateRequestProcessor} which computes a hash of selected doc values, and uses this hash + * value to reject/accept doc updates. + * + * Depending on {#discardSameDocuments} value, this processor may reject or accept doc update. + * This implementation can be used for monitoring or rejecting no-op updates (updates that do not change Solr document). + *

+ * Note: hash is computed using {@link Lookup3Signature}. + *

+ * @see Lookup3Signature + */ +public class ContentHashVersionProcessor extends UpdateRequestProcessor { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final SchemaField hashField; + private final SolrQueryResponse rsp; + private final SolrCore core; + private final Predicate includedFields; // Matcher for included fields in hash + private final Predicate excludedFields; // Matcher for excluded fields from hash + private OldDocProvider oldDocProvider = new DefaultDocProvider(); + private boolean discardSameDocuments; + private int sameCount = 0; + private int differentCount = 0; + private int unknownCount = 0; + + public ContentHashVersionProcessor(Predicate hashIncludedFields, Predicate hashExcludedFields, String hashFieldName, SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + super(next); + this.core = req.getCore(); + this.hashField = new SchemaField(hashFieldName, new TextField()); + this.rsp = rsp; + this.includedFields = hashIncludedFields; + this.excludedFields = hashExcludedFields; + } + + public void processAdd(AddUpdateCommand cmd) throws IOException { + SolrInputDocument newDoc = cmd.getSolrInputDocument(); + String newHash = computeDocHash(newDoc); + newDoc.setField(hashField.getName(), newHash); + int i = 0; + + if (!validateHash(cmd.getIndexedId(), newHash)) { + return; + } + + while (true) { + logOverlyFailedRetries(i, cmd); + try { + super.processAdd(cmd); + return; + } catch (SolrException e) { + if (e.code() != 409) { + throw e; + } + ++i; + } + } + } + + @Override + public void finish() throws IOException { + try { + super.finish(); + } finally { + rsp.addToLog("numAddsExisting", sameCount + differentCount + unknownCount); + rsp.addToLog("numAddsExistingWithIdentical", sameCount); + rsp.addToLog("numAddsExistingUnknown", unknownCount); + } + } + + private static void logOverlyFailedRetries(int i, UpdateCommand cmd) { + if ((i & 255) == 255) { + 
log.warn("Unusual number of optimistic concurrency retries: retries={} cmd={}", i, cmd); + } + } + + void setOldDocProvider(OldDocProvider oldDocProvider) { + this.oldDocProvider = oldDocProvider; + } + + void setDiscardSameDocuments(boolean discardSameDocuments) { + this.discardSameDocuments = discardSameDocuments; + } + + private boolean validateHash(BytesRef indexedDocId, String newHash) throws IOException { + assert null != indexedDocId; + + var docFoundAndOldUserVersions = getOldUserVersionsFromStored(indexedDocId); + if (docFoundAndOldUserVersions.found) { + String oldHash = docFoundAndOldUserVersions.oldHash; // No hash: might want to keep track of these too + if (oldHash == null) { + unknownCount++; + return true; + } else if (Objects.equals(newHash, oldHash)) { + sameCount++; + return !discardSameDocuments; + } else { + differentCount++; + return true; + } + } + return true; // Doc not found + } + + private DocFoundAndOldUserAndSolrVersions getOldUserVersionsFromStored(BytesRef indexedDocId) throws IOException { + SolrInputDocument oldDoc = oldDocProvider.getDocument(core, hashField.getName(), indexedDocId); + return null == oldDoc ? DocFoundAndOldUserAndSolrVersions.NOT_FOUND : getUserVersionAndSolrVersionFromDocument(oldDoc); + } + + private DocFoundAndOldUserAndSolrVersions getUserVersionAndSolrVersionFromDocument(SolrInputDocument oldDoc) { + Object o = oldDoc.getFieldValue(hashField.getName()); + if (o != null) { + return new DocFoundAndOldUserAndSolrVersions(o.toString()); + } + return new DocFoundAndOldUserAndSolrVersions(); + } + + public String computeDocHash(SolrInputDocument doc) { + List docIncludedFieldNames = doc.getFieldNames().stream() + .filter(includedFields) // Keep fields that match 'included fields' matcher... 
+ .filter(excludedFields.negate()) // ...and exclude fields that match 'excluded fields' matcher + .sorted() // Sort to ensure consistent field order across different doc field orders + .toList(); + + final Signature sig = new Lookup3Signature(); + for (String fieldName : docIncludedFieldNames) { + sig.add(fieldName); + Object o = doc.getFieldValue(fieldName); + if (o instanceof Collection) { + for (Object oo : (Collection) o) { + sig.add(String.valueOf(oo)); + } + } else { + sig.add(String.valueOf(o)); + } + } + + // Signature, depending on implementation, may return 8-byte or 16-byte value + byte[] signature = sig.getSignature(); + return Base64.getEncoder().encodeToString(signature); // Makes a base64 hash out of signature value + } + + interface OldDocProvider { + SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) throws IOException; + } + + private static class DefaultDocProvider implements OldDocProvider { + @Override + public SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) throws IOException { + return RealTimeGetComponent.getInputDocument(core, indexedDocId, indexedDocId, null, Set.of(hashField), Resolution.PARTIAL); + } + } + + private static class DocFoundAndOldUserAndSolrVersions { + private static final DocFoundAndOldUserAndSolrVersions NOT_FOUND = new DocFoundAndOldUserAndSolrVersions(); + private final boolean found; + + public String oldHash; + + private DocFoundAndOldUserAndSolrVersions() { + this.found = false; + } + private DocFoundAndOldUserAndSolrVersions(String oldHash) { + this.found = true; + this.oldHash = oldHash; + } + + } +} diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java new file mode 100644 index 00000000000..a59c98e5882 --- /dev/null +++ 
b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java @@ -0,0 +1,144 @@ +package org.apache.solr.update.processor; + +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.core.SolrCore; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.util.plugin.SolrCoreAware; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +/** + * Factory for {@link ContentHashVersionProcessor} instances. + */ +public class ContentHashVersionProcessorFactory extends UpdateRequestProcessorFactory implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways { + private static final char SEPARATOR = ','; // Separator for included/excluded fields + static final String CONTENT_HASH_ENABLED_PARAM = "contentHashEnabled"; + private List includeFields = Collections.singletonList("*"); // Included fields defaults to 'all' + private List excludeFields = new ArrayList<>(); // No excluded field by default, yet hashFieldName is excluded by default + private String hashFieldName = "content_hash"; // Field name to store computed hash on create/update operations + private boolean discardSameDocuments = true; + + public ContentHashVersionProcessorFactory() { + } + + public void init(NamedList args) { + Object tmp = args.remove("includeFields"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, + "'includeFields' must be configured as a "); + } + // Include fields support comma separated list of fields (e.g. "field1,field2,field3"). 
+ // Also supports "*" to include all fields + this.includeFields = StrUtils.splitSmart((String) tmp, SEPARATOR) + .stream() + .map(String::trim) + .collect(Collectors.toList()); + } + tmp = args.remove("hashFieldName"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'hashFieldName' must be configured as a "); + } + this.hashFieldName = (String) tmp; + } + + tmp = args.remove("excludeFields"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' must be configured as a "); + } + if ("*".equals(((String) tmp).trim())) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' can't exclude all fields."); + } + // Exclude fields support comma separated list of fields (e.g. "excluded_field1,excluded_field2"). + // Also supports "*" to exclude all fields + this.excludeFields = StrUtils.splitSmart((String) tmp, SEPARATOR) + .stream() + .map(String::trim) + .collect(Collectors.toList()); + } + excludeFields.add(hashFieldName); // Hash field name is excluded from hash computation + + tmp = args.remove("hashCompareStrategy"); + if (tmp != null) { + if (!(tmp instanceof String)) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'hashCompareStrategy' must be configured as a "); + } + String value = ((String) tmp).toLowerCase(); + if ("discard".equalsIgnoreCase(value)) { + discardSameDocuments = true; + } else if ("log".equalsIgnoreCase(value)) { + discardSameDocuments = false; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Value '" + value + "' is unsupported for 'hashCompareStrategy', only 'discard' and 'log' are supported."); + } + } + + super.init(args); + } + + public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + if (!req.getParams().getBool(CONTENT_HASH_ENABLED_PARAM, false)) { + 
return next; + } + + ContentHashVersionProcessor processor = new ContentHashVersionProcessor( + buildFieldMatcher(includeFields), + buildFieldMatcher(excludeFields), + hashFieldName, + req, + rsp, + next); + processor.setDiscardSameDocuments(discardSameDocuments); + return processor; + } + + public void inform(SolrCore core) { + if (core.getLatestSchema().getUniqueKeyField() == null) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "schema must have uniqueKey defined."); + } + } + + public String getHashFieldName() { + return hashFieldName; + } + + public List getIncludeFields() { + return includeFields; + } + + public List getExcludeFields() { + return excludeFields; + } + + public boolean discardSameDocuments() { + return discardSameDocuments; + } + + static Predicate buildFieldMatcher(List fieldNames) { + return fieldName -> { + for (String currentFieldName : fieldNames) { + if ("*".equals(currentFieldName)) { + return true; + } + if (fieldName.equals(currentFieldName)) { + return true; + } + if (currentFieldName.length() > 1 + && currentFieldName.endsWith("*") + && fieldName.startsWith(currentFieldName.substring(0, currentFieldName.length() - 1))) { + return true; + } + } + return false; + }; + } +} diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java new file mode 100644 index 00000000000..51af18a075d --- /dev/null +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java @@ -0,0 +1,147 @@ +package org.apache.solr.update.processor; + +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.junit.Test; + +import java.util.List; + +import static 
org.apache.solr.update.processor.ContentHashVersionProcessorFactory.CONTENT_HASH_ENABLED_PARAM; +import static org.junit.Assert.*; +import static org.mockito.ArgumentMatchers.*; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ContentHashVersionProcessorFactoryTest { + + @Test + public void shouldHaveSensibleDefaultValues() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + assertEquals(List.of("*"), factory.getIncludeFields()); + assertEquals("content_hash", factory.getHashFieldName()); + assertTrue(factory.discardSameDocuments()); + } + + @Test + public void shouldInitWithHashFieldName() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "_hash_field_"); + factory.init(args); + + assertEquals("_hash_field_", factory.getHashFieldName()); + } + + @Test + public void shouldInitWithAllField() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("includeFields", "*"); + factory.init(args); + + assertEquals(1, factory.getIncludeFields().size()); + assertEquals("*", factory.getIncludeFields().getFirst()); + } + + @Test + public void shouldInitWithIncludedFields() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("includeFields", " field1,field2 , field3 "); + factory.init(args); + + assertEquals(3, factory.getIncludeFields().size()); + assertEquals(List.of("field1", "field2", "field3"), factory.getIncludeFields()); + } + + @Test + public void shouldInitWithExcludedFields() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("excludeFields", " field1,field2 , field3 "); + factory.init(args); + + assertEquals(4, 
factory.getExcludeFields().size()); + assertEquals(List.of("field1", "field2", "field3", "content_hash"), factory.getExcludeFields()); + } + + @Test + public void shouldSelectRejectSameHashStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashCompareStrategy", "discard"); + factory.init(args); + + assertTrue(factory.discardSameDocuments()); + } + + @Test + public void shouldSelectLogStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashCompareStrategy", "log"); + factory.init(args); + + assertFalse(factory.discardSameDocuments()); + } + + @Test + public void shouldSelectDiscardStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashCompareStrategy", "discard"); + factory.init(args); + + assertTrue(factory.discardSameDocuments()); + } + + @Test(expected = SolrException.class) + public void shouldSelectUnsupportedStrategy() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("hashCompareStrategy", "unsupported value"); + factory.init(args); + } + + @Test(expected = SolrException.class) + public void shouldRejectExcludeAllFields() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + NamedList args = new NamedList<>(); + args.add("excludeFields", "*"); + factory.init(args); + } + + @Test + public void shouldDisableContentHashByQueryParameter() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); + SolrQueryRequest updateRequest = createUpdateRequest(false); // Request disables processor + + UpdateRequestProcessor instance = 
factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); + + assertEquals(instance, next); + } + + @Test + public void shouldEnableContentHashByQueryParameter() { + ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); + UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); + SolrQueryRequest updateRequest = createUpdateRequest(true); // Request enables processor + + UpdateRequestProcessor instance = factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); + + assertNotEquals(instance, next); + } + + private static SolrQueryRequest createUpdateRequest(boolean enableContentHashParamValue) { + SolrQueryRequest req = mock(SolrQueryRequest.class); + SolrParams solrParams = mock(SolrParams.class); + when(solrParams.getBool(eq(CONTENT_HASH_ENABLED_PARAM), anyBoolean())).thenReturn(enableContentHashParamValue); + when(req.getParams()).thenReturn(solrParams); + + return req; + } +} diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java new file mode 100644 index 00000000000..1f399816ce7 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java @@ -0,0 +1,399 @@ +package org.apache.solr.update.processor; + +import org.apache.lucene.util.BytesRef; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.UUIDField; +import org.apache.solr.update.AddUpdateCommand; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.UUID; + +import 
static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.*; + +public class ContentHashVersionProcessorTest extends UpdateProcessorTestBase { + + public static final String ID_FIELD = "_id"; + public static final String HASH_FIELD_NAME = "_hash_"; + public static final String FIRST_FIELD = "field1"; + public static final String SECOND_FIELD = "field2"; + public static final String THIRD_FIELD = "docField3"; + public static final String FOURTH_FIELD = "field4"; + private SolrQueryRequest req; + + private ContentHashVersionProcessor getContentHashVersionProcessor( + SolrQueryRequest req, + SolrQueryResponse rsp, + UpdateRequestProcessor next, + List includedFields, + List excludedFields) { + ContentHashVersionProcessor processor = new ContentHashVersionProcessor( + ContentHashVersionProcessorFactory.buildFieldMatcher(includedFields), + ContentHashVersionProcessorFactory.buildFieldMatcher(excludedFields), + HASH_FIELD_NAME, + req, + rsp, + next); + + // Given (previous doc retrieval configuration) + processor.setOldDocProvider((core, hashField, indexedDocId) -> { + final SolrInputDocument inputDocument = doc( + f(ID_FIELD, indexedDocId.utf8ToString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + return doc( + f(ID_FIELD, inputDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, inputDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, inputDocument.getFieldValue(SECOND_FIELD)), + f(hashField, processor.computeDocHash(inputDocument)) + ); + }); + return processor; + } + + @Before + public void setUp() throws Exception { + super.setUp(); + + // Given (processor configuration) + req = mock(SolrQueryRequest.class); + when(req.getParams()).thenReturn(mock(SolrParams.class)); + + // Given (schema configuration) + IndexSchema indexSchema = mock(IndexSchema.class); + when(req.getSchema()).thenReturn(indexSchema); + 
when(indexSchema.getUniqueKeyField()).thenReturn(new SchemaField(ID_FIELD, new UUIDField())); + when(indexSchema.indexableUniqueKey(anyString())).then(invocationOnMock -> new BytesRef(invocationOnMock.getArgument( + 0).toString())); + } + + @Test + public void shouldComputeHashForDoc() { + // Given + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("*"), + List.of(ID_FIELD)); + + // Given (doc for update) + SolrInputDocument inputDocument1 = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + + // Then + assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument1)); + + // Given (doc for update - with different order) + SolrInputDocument inputDocument2 = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields"), + f(FIRST_FIELD, "Values will serve as input to compute a hash") + ); + + // Then (hash remain same, since id is excluded from signature fields) + assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument2)); + } + + @Test + public void shouldUseExcludedFieldsWildcard() { + // Given + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of("*"), + List.of("field*")); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, UUID.randomUUID().toString()), + f(SECOND_FIELD, UUID.randomUUID().toString()), + f(THIRD_FIELD, "constant to have a constant hash"), + f(FOURTH_FIELD, UUID.randomUUID().toString()) + ); + + // Then (only ID and THIRD_FIELD is used in hash, other fields contain random values) + assertEquals("bwE8Zjq0aOs=", 
processor.computeDocHash(inputDocument)); // Hash if only ID field was used + } + + @Test + public void shouldUseIncludedFieldsWildcard() { + // Given + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of("field*"), + List.of(THIRD_FIELD)); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4") + ); + + // Then + assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); + } + + @Test + public void shouldUseIncludedFieldsWildcard2() { + // Given (variant of previous shouldUseIncludedFieldsWildcard, without the excludedField config) + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of("field*"), + Collections.emptyList()); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4") + ); + + // Then + assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); + } + + @Test + public void shouldDedupIncludedFields() { + // Given (processor to include field1 and field2 only) + ContentHashVersionProcessor processorWithDuplicatedFieldName = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of(FIRST_FIELD, FIRST_FIELD, SECOND_FIELD), + Collections.emptyList()); + + ContentHashVersionProcessor processorWithWildcard 
= getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), + List.of(SECOND_FIELD, FIRST_FIELD, "field1*"), // Also change order of config (test reorder of field names) + Collections.emptyList()); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4") + ); + + // Then + assertEquals("XavrOYGlkXM=", processorWithDuplicatedFieldName.computeDocHash(inputDocument)); + assertEquals("XavrOYGlkXM=", processorWithWildcard.computeDocHash(inputDocument)); + } + + @Test + public void shouldCreateSignatureForNewDoc() throws IOException { + // Given + SolrQueryResponse response = mock(SolrQueryResponse.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + Arrays.asList(FIRST_FIELD, SECOND_FIELD), + Collections.emptyList()); + processor.setDiscardSameDocuments(false); + processor.setOldDocProvider((core, hashField, indexedDocId) -> null); + + // Given (command) + AddUpdateCommand cmd = new AddUpdateCommand(req); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmd.solrDoc = inputDocument; + + // When + processor.processAdd(cmd); + processor.finish(); + + // Then + assertNotNull(inputDocument.getField(HASH_FIELD_NAME)); // signature field got added + assertEquals( + processor.computeDocHash(inputDocument), + inputDocument.getField(HASH_FIELD_NAME).getValue()); // ... 
and contains expected value + + // Then (asserts on hash comparison results) + verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(0)); + verify(response, times(1)).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); // And no hash clash with old doc + } + + @Test + public void shouldAddToResponseLog() throws IOException { + // Given + SolrQueryResponse response = mock(SolrQueryResponse.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + Arrays.asList(FIRST_FIELD, SECOND_FIELD), + Collections.emptyList()); + processor.setDiscardSameDocuments(false); + + // Given (command to update existing doc) + AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); + + // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) + SolrInputDocument initialDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmdDoesNotChangeValues.solrDoc = doc( + f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), + f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument)) + ); + + // Given (command to update existing doc with different content) + AddUpdateCommand cmdChangesDocValues = new AddUpdateCommand(req); + + // Given (doc for update - does *not* match the existing doc, see getContentHashVersionProcessor()) + cmdChangesDocValues.solrDoc = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "This is a doc with values"), + f(SECOND_FIELD, "that differs from stored doc, so it's considered new") + ); + + // When + processor.processAdd(cmdDoesNotChangeValues); + processor.processAdd(cmdChangesDocValues); + processor.finish(); + + // Then (read as 
follows: 2 updates occurred for an existing doc. Among these updates, 1 update tried to replace + // doc with the same content) + verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(2)); + verify(response, times(1)).addToLog(eq("numAddsExistingWithIdentical"), eq(1)); + } + + @Test + public void shouldNotUpdateSignatureForNewDoc() throws IOException { + // Given + SolrQueryResponse response = mock(SolrQueryResponse.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + List.of(SECOND_FIELD), + Collections.emptyList()); + + // Given (command) + AddUpdateCommand cmd = new AddUpdateCommand(req); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmd.solrDoc = inputDocument; + + // When + processor.processAdd(cmd); + + // Then + assertNotNull(inputDocument.getField(HASH_FIELD_NAME)); // signature field got added + assertEquals( + processor.computeDocHash(inputDocument), + inputDocument.getField(HASH_FIELD_NAME).getValue()); // ... 
and contains expected value + verify(response, never()).addToLog(eq("numAddsExisting"), eq(0)); + verify(response, never()).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); + } + + @Test + public void shouldExcludeFieldsUpdateSignatureForNewDoc() throws IOException { + // Given + SolrQueryResponse response = mock(SolrQueryResponse.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + List.of(FIRST_FIELD, SECOND_FIELD), + List.of(FIRST_FIELD)); + processor.setDiscardSameDocuments(false); + + // Given (command) + AddUpdateCommand cmd = new AddUpdateCommand(req); + + // Given (doc for update) + SolrInputDocument inputDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmd.solrDoc = inputDocument; + + // When + processor.processAdd(cmd); + + // Then + assertNotNull(inputDocument.getField(HASH_FIELD_NAME)); // signature field got added + assertEquals( + processor.computeDocHash(inputDocument), + inputDocument.getField(HASH_FIELD_NAME).getValue()); // ... 
and contains expected value + verify(response, never()).addToLog(eq("numAddsExisting"), eq(1)); + verify(response, never()).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); + } + + @Test + public void shouldCommitWithDiscardModeEnabled() throws IOException { + // Given + UpdateRequestProcessor nextProcessor = mock(UpdateRequestProcessor.class); + ContentHashVersionProcessor processor = getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + nextProcessor, + List.of(FIRST_FIELD, SECOND_FIELD), + List.of(FIRST_FIELD)); + processor.setDiscardSameDocuments(true); + + // Given (command to update existing doc) + AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); + + // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) + SolrInputDocument initialDocument = doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields") + ); + cmdDoesNotChangeValues.solrDoc = doc( + f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), + f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument)) + ); + + // When + processor.processAdd(cmdDoesNotChangeValues); + + // Then + verify(nextProcessor, never()).processAdd(eq(cmdDoesNotChangeValues)); + } + +} From 527765262aa5081bd1793e61f552448959e2709b Mon Sep 17 00:00:00 2001 From: fhuaulme Date: Fri, 3 Apr 2026 17:42:07 +0200 Subject: [PATCH 2/6] * lint fixes --- .../ContentHashVersionProcessor.java | 344 ++++++++------- .../ContentHashVersionProcessorFactory.java | 99 +++-- ...ontentHashVersionProcessorFactoryTest.java | 51 ++- .../ContentHashVersionProcessorTest.java | 400 ++++++++++-------- 4 files changed, 519 insertions(+), 375 deletions(-) diff --git 
a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java index 5127a6453f2..f4b40221a1b 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -1,5 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.solr.update.processor; +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.Base64; +import java.util.Collection; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.function.Predicate; import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; @@ -15,182 +40,197 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.lang.invoke.MethodHandles; -import java.util.Base64; -import java.util.Collection; -import java.util.List; -import java.util.Objects; -import java.util.Set; -import java.util.function.Predicate; - /** - * An implementation of {@link UpdateRequestProcessor} which computes a hash of selected doc values, and uses this hash - * value to reject/accept doc updates. + * An implementation of {@link UpdateRequestProcessor} which computes a hash of selected doc values, + * and uses this hash value to reject/accept doc updates. + * *
    - *
  • When no corresponding doc with same id exists (create), computed hash is added to the document.
  • - *
  • When a previous doc exists (update), a new hash is computed using new version values and compared with old hash.
  • + *
  • When no corresponding doc with same id exists (create), computed hash is added to the + * document. + *
  • When a previous doc exists (update), a new hash is computed using new version values and + * compared with old hash. *
- * Depending on {#discardSameDocuments} value, this processor may reject or accept doc update. - * This implementation can be used for monitoring or rejecting no-op updates (updates that do not change Solr document). - *

- * Note: hash is computed using {@link Lookup3Signature}. - *

+ * + * Depending on {@code discardSameDocuments} value, this processor may reject or accept doc update. This + * implementation can be used for monitoring or rejecting no-op updates (updates that do not change + * Solr document). + * + *

Note: hash is computed using {@link Lookup3Signature}. + * * @see Lookup3Signature */ public class ContentHashVersionProcessor extends UpdateRequestProcessor { - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private final SchemaField hashField; - private final SolrQueryResponse rsp; - private final SolrCore core; - private final Predicate includedFields; // Matcher for included fields in hash - private final Predicate excludedFields; // Matcher for excluded fields from hash - private OldDocProvider oldDocProvider = new DefaultDocProvider(); - private boolean discardSameDocuments; - private int sameCount = 0; - private int differentCount = 0; - private int unknownCount = 0; - - public ContentHashVersionProcessor(Predicate hashIncludedFields, Predicate hashExcludedFields, String hashFieldName, SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { - super(next); - this.core = req.getCore(); - this.hashField = new SchemaField(hashFieldName, new TextField()); - this.rsp = rsp; - this.includedFields = hashIncludedFields; - this.excludedFields = hashExcludedFields; + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final SchemaField hashField; + private final SolrQueryResponse rsp; + private final SolrCore core; + private final Predicate includedFields; // Matcher for included fields in hash + private final Predicate excludedFields; // Matcher for excluded fields from hash + private OldDocProvider oldDocProvider = new DefaultDocProvider(); + private boolean discardSameDocuments; + private int sameCount = 0; + private int differentCount = 0; + private int unknownCount = 0; + + public ContentHashVersionProcessor( + Predicate hashIncludedFields, + Predicate hashExcludedFields, + String hashFieldName, + SolrQueryRequest req, + SolrQueryResponse rsp, + UpdateRequestProcessor next) { + super(next); + this.core = req.getCore(); + this.hashField = 
new SchemaField(hashFieldName, new TextField()); + this.rsp = rsp; + this.includedFields = hashIncludedFields; + this.excludedFields = hashExcludedFields; + } + + public void processAdd(AddUpdateCommand cmd) throws IOException { + SolrInputDocument newDoc = cmd.getSolrInputDocument(); + String newHash = computeDocHash(newDoc); + newDoc.setField(hashField.getName(), newHash); + int i = 0; + + if (!validateHash(cmd.getIndexedId(), newHash)) { + return; } - public void processAdd(AddUpdateCommand cmd) throws IOException { - SolrInputDocument newDoc = cmd.getSolrInputDocument(); - String newHash = computeDocHash(newDoc); - newDoc.setField(hashField.getName(), newHash); - int i = 0; - - if (!validateHash(cmd.getIndexedId(), newHash)) { - return; - } - - while (true) { - logOverlyFailedRetries(i, cmd); - try { - super.processAdd(cmd); - return; - } catch (SolrException e) { - if (e.code() != 409) { - throw e; - } - ++i; - } + while (true) { + logOverlyFailedRetries(i, cmd); + try { + super.processAdd(cmd); + return; + } catch (SolrException e) { + if (e.code() != 409) { + throw e; } + ++i; + } } - - @Override - public void finish() throws IOException { - try { - super.finish(); - } finally { - rsp.addToLog("numAddsExisting", sameCount + differentCount + unknownCount); - rsp.addToLog("numAddsExistingWithIdentical", sameCount); - rsp.addToLog("numAddsExistingUnknown", unknownCount); - } + } + + @Override + public void finish() throws IOException { + try { + super.finish(); + } finally { + rsp.addToLog("numAddsExisting", sameCount + differentCount + unknownCount); + rsp.addToLog("numAddsExistingWithIdentical", sameCount); + rsp.addToLog("numAddsExistingUnknown", unknownCount); } + } - private static void logOverlyFailedRetries(int i, UpdateCommand cmd) { - if ((i & 255) == 255) { - log.warn("Unusual number of optimistic concurrency retries: retries={} cmd={}", i, cmd); - } + private static void logOverlyFailedRetries(int i, UpdateCommand cmd) { + if ((i & 255) == 255) { + 
log.warn("Unusual number of optimistic concurrency retries: retries={} cmd={}", i, cmd); } - - void setOldDocProvider(OldDocProvider oldDocProvider) { - this.oldDocProvider = oldDocProvider; + } + + void setOldDocProvider(OldDocProvider oldDocProvider) { + this.oldDocProvider = oldDocProvider; + } + + void setDiscardSameDocuments(boolean discardSameDocuments) { + this.discardSameDocuments = discardSameDocuments; + } + + private boolean validateHash(BytesRef indexedDocId, String newHash) throws IOException { + assert null != indexedDocId; + + var docFoundAndOldUserVersions = getOldUserVersionsFromStored(indexedDocId); + if (docFoundAndOldUserVersions.found) { + String oldHash = + docFoundAndOldUserVersions.oldHash; // No hash: might want to keep track of these too + if (oldHash == null) { + unknownCount++; + return true; + } else if (Objects.equals(newHash, oldHash)) { + sameCount++; + return !discardSameDocuments; + } else { + differentCount++; + return true; + } } - - void setDiscardSameDocuments(boolean discardSameDocuments) { - this.discardSameDocuments = discardSameDocuments; + return true; // Doc not found + } + + private DocFoundAndOldUserAndSolrVersions getOldUserVersionsFromStored(BytesRef indexedDocId) + throws IOException { + SolrInputDocument oldDoc = oldDocProvider.getDocument(core, hashField.getName(), indexedDocId); + return null == oldDoc + ? 
DocFoundAndOldUserAndSolrVersions.NOT_FOUND + : getUserVersionAndSolrVersionFromDocument(oldDoc); + } + + private DocFoundAndOldUserAndSolrVersions getUserVersionAndSolrVersionFromDocument( + SolrInputDocument oldDoc) { + Object o = oldDoc.getFieldValue(hashField.getName()); + if (o != null) { + return new DocFoundAndOldUserAndSolrVersions(o.toString()); } - - private boolean validateHash(BytesRef indexedDocId, String newHash) throws IOException { - assert null != indexedDocId; - - var docFoundAndOldUserVersions = getOldUserVersionsFromStored(indexedDocId); - if (docFoundAndOldUserVersions.found) { - String oldHash = docFoundAndOldUserVersions.oldHash; // No hash: might want to keep track of these too - if (oldHash == null) { - unknownCount++; - return true; - } else if (Objects.equals(newHash, oldHash)) { - sameCount++; - return !discardSameDocuments; - } else { - differentCount++; - return true; - } + return new DocFoundAndOldUserAndSolrVersions(); + } + + public String computeDocHash(SolrInputDocument doc) { + List docIncludedFieldNames = + doc.getFieldNames().stream() + .filter(includedFields) // Keep fields that match 'included fields' matcher... + .filter( + excludedFields + .negate()) // ...and exclude fields that match 'excluded fields' matcher + .sorted() // Sort to ensure consistent field order across different doc field orders + .toList(); + + final Signature sig = new Lookup3Signature(); + for (String fieldName : docIncludedFieldNames) { + sig.add(fieldName); + Object o = doc.getFieldValue(fieldName); + if (o instanceof Collection) { + for (Object oo : (Collection) o) { + sig.add(String.valueOf(oo)); } - return true; // Doc not found + } else { + sig.add(String.valueOf(o)); + } } - private DocFoundAndOldUserAndSolrVersions getOldUserVersionsFromStored(BytesRef indexedDocId) throws IOException { - SolrInputDocument oldDoc = oldDocProvider.getDocument(core, hashField.getName(), indexedDocId); - return null == oldDoc ? 
DocFoundAndOldUserAndSolrVersions.NOT_FOUND : getUserVersionAndSolrVersionFromDocument(oldDoc); - } + // Signature, depending on implementation, may return 8-byte or 16-byte value + byte[] signature = sig.getSignature(); + return Base64.getEncoder() + .encodeToString(signature); // Makes a base64 hash out of signature value + } - private DocFoundAndOldUserAndSolrVersions getUserVersionAndSolrVersionFromDocument(SolrInputDocument oldDoc) { - Object o = oldDoc.getFieldValue(hashField.getName()); - if (o != null) { - return new DocFoundAndOldUserAndSolrVersions(o.toString()); - } - return new DocFoundAndOldUserAndSolrVersions(); - } + interface OldDocProvider { + SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) + throws IOException; + } - public String computeDocHash(SolrInputDocument doc) { - List docIncludedFieldNames = doc.getFieldNames().stream() - .filter(includedFields) // Keep fields that match 'included fields' matcher... - .filter(excludedFields.negate()) // ...and exclude fields that match 'excluded fields' matcher - .sorted() // Sort to ensure consistent field order across different doc field orders - .toList(); - - final Signature sig = new Lookup3Signature(); - for (String fieldName : docIncludedFieldNames) { - sig.add(fieldName); - Object o = doc.getFieldValue(fieldName); - if (o instanceof Collection) { - for (Object oo : (Collection) o) { - sig.add(String.valueOf(oo)); - } - } else { - sig.add(String.valueOf(o)); - } - } - - // Signature, depending on implementation, may return 8-byte or 16-byte value - byte[] signature = sig.getSignature(); - return Base64.getEncoder().encodeToString(signature); // Makes a base64 hash out of signature value - } - - interface OldDocProvider { - SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) throws IOException; - } - - private static class DefaultDocProvider implements OldDocProvider { - @Override - public SolrInputDocument getDocument(SolrCore 
core, String hashField, BytesRef indexedDocId) throws IOException { - return RealTimeGetComponent.getInputDocument(core, indexedDocId, indexedDocId, null, Set.of(hashField), Resolution.PARTIAL); - } + private static class DefaultDocProvider implements OldDocProvider { + @Override + public SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) + throws IOException { + return RealTimeGetComponent.getInputDocument( + core, indexedDocId, indexedDocId, null, Set.of(hashField), Resolution.PARTIAL); } + } - private static class DocFoundAndOldUserAndSolrVersions { - private static final DocFoundAndOldUserAndSolrVersions NOT_FOUND = new DocFoundAndOldUserAndSolrVersions(); - private final boolean found; + private static class DocFoundAndOldUserAndSolrVersions { + private static final DocFoundAndOldUserAndSolrVersions NOT_FOUND = + new DocFoundAndOldUserAndSolrVersions(); + private final boolean found; - public String oldHash; + public String oldHash; - private DocFoundAndOldUserAndSolrVersions() { - this.found = false; - } - private DocFoundAndOldUserAndSolrVersions(String oldHash) { - this.found = true; - this.oldHash = oldHash; - } + private DocFoundAndOldUserAndSolrVersions() { + this.found = false; + } + private DocFoundAndOldUserAndSolrVersions(String oldHash) { + this.found = true; + this.oldHash = oldHash; } + } } diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java index a59c98e5882..48fa52db928 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java @@ -1,5 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.solr.update.processor; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.function.Predicate; +import java.util.stream.Collectors; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.StrUtils; @@ -8,44 +31,40 @@ import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.util.plugin.SolrCoreAware; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -/** - * Factory for {@link ContentHashVersionProcessor} instances. - */ -public class ContentHashVersionProcessorFactory extends UpdateRequestProcessorFactory implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways { +/** Factory for {@link ContentHashVersionProcessor} instances. 
*/ +public class ContentHashVersionProcessorFactory extends UpdateRequestProcessorFactory + implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways { private static final char SEPARATOR = ','; // Separator for included/excluded fields static final String CONTENT_HASH_ENABLED_PARAM = "contentHashEnabled"; - private List includeFields = Collections.singletonList("*"); // Included fields defaults to 'all' - private List excludeFields = new ArrayList<>(); // No excluded field by default, yet hashFieldName is excluded by default - private String hashFieldName = "content_hash"; // Field name to store computed hash on create/update operations + private List includeFields = + Collections.singletonList("*"); // Included fields defaults to 'all' + private List excludeFields = new ArrayList<>(); + // No excluded field by default, yet hashFieldName is excluded by default + private String hashFieldName = + "content_hash"; // Field name to store computed hash on create/update operations private boolean discardSameDocuments = true; - public ContentHashVersionProcessorFactory() { - } + public ContentHashVersionProcessorFactory() {} public void init(NamedList args) { Object tmp = args.remove("includeFields"); if (tmp != null) { if (!(tmp instanceof String)) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, - "'includeFields' must be configured as a "); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'includeFields' must be configured as a "); } // Include fields support comma separated list of fields (e.g. "field1,field2,field3"). 
// Also supports "*" to include all fields - this.includeFields = StrUtils.splitSmart((String) tmp, SEPARATOR) - .stream() + this.includeFields = + StrUtils.splitSmart((String) tmp, SEPARATOR).stream() .map(String::trim) .collect(Collectors.toList()); } tmp = args.remove("hashFieldName"); if (tmp != null) { if (!(tmp instanceof String)) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'hashFieldName' must be configured as a "); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'hashFieldName' must be configured as a "); } this.hashFieldName = (String) tmp; } @@ -53,15 +72,18 @@ public void init(NamedList args) { tmp = args.remove("excludeFields"); if (tmp != null) { if (!(tmp instanceof String)) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' must be configured as a "); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' must be configured as a "); } if ("*".equals(((String) tmp).trim())) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' can't exclude all fields."); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'excludeFields' can't exclude all fields."); } - // Exclude fields support comma separated list of fields (e.g. "excluded_field1,excluded_field2"). + // Exclude fields support comma separated list of fields (e.g. + // "excluded_field1,excluded_field2"). 
// Also supports "*" to exclude all fields - this.excludeFields = StrUtils.splitSmart((String) tmp, SEPARATOR) - .stream() + this.excludeFields = + StrUtils.splitSmart((String) tmp, SEPARATOR).stream() .map(String::trim) .collect(Collectors.toList()); } @@ -70,27 +92,35 @@ public void init(NamedList args) { tmp = args.remove("hashCompareStrategy"); if (tmp != null) { if (!(tmp instanceof String)) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "'hashCompareStrategy' must be configured as a "); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "'hashCompareStrategy' must be configured as a "); } - String value = ((String) tmp).toLowerCase(); + String value = ((String) tmp).toLowerCase(Locale.ROOT); if ("discard".equalsIgnoreCase(value)) { discardSameDocuments = true; } else if ("log".equalsIgnoreCase(value)) { discardSameDocuments = false; } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Value '" + value + "' is unsupported for 'hashCompareStrategy', only 'discard' and 'log' are supported."); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Value '" + + value + + "' is unsupported for 'hashCompareStrategy', only 'discard' and 'log' are supported."); } } super.init(args); } - public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + public UpdateRequestProcessor getInstance( + SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { if (!req.getParams().getBool(CONTENT_HASH_ENABLED_PARAM, false)) { return next; } - ContentHashVersionProcessor processor = new ContentHashVersionProcessor( + ContentHashVersionProcessor processor = + new ContentHashVersionProcessor( buildFieldMatcher(includeFields), buildFieldMatcher(excludeFields), hashFieldName, @@ -103,7 +133,8 @@ public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryRespons public void inform(SolrCore core) { if 
(core.getLatestSchema().getUniqueKeyField() == null) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "schema must have uniqueKey defined."); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "schema must have uniqueKey defined."); } } @@ -120,7 +151,7 @@ public List getExcludeFields() { } public boolean discardSameDocuments() { - return discardSameDocuments; + return discardSameDocuments; } static Predicate buildFieldMatcher(List fieldNames) { @@ -133,8 +164,8 @@ static Predicate buildFieldMatcher(List fieldNames) { return true; } if (currentFieldName.length() > 1 - && currentFieldName.endsWith("*") - && fieldName.startsWith(currentFieldName.substring(0, currentFieldName.length() - 1))) { + && currentFieldName.endsWith("*") + && fieldName.startsWith(currentFieldName.substring(0, currentFieldName.length() - 1))) { return true; } } diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java index 51af18a075d..2f2e920ac6d 100644 --- a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java @@ -1,22 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.solr.update.processor; +import static org.apache.solr.SolrTestCaseJ4.assumeWorkingMockito; +import static org.apache.solr.update.processor.ContentHashVersionProcessorFactory.CONTENT_HASH_ENABLED_PARAM; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.util.List; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; +import org.junit.BeforeClass; import org.junit.Test; -import java.util.List; - -import static org.apache.solr.update.processor.ContentHashVersionProcessorFactory.CONTENT_HASH_ENABLED_PARAM; -import static org.junit.Assert.*; -import static org.mockito.ArgumentMatchers.*; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; - public class ContentHashVersionProcessorFactoryTest { + @BeforeClass + public static void beforeClass() throws Exception { + assumeWorkingMockito(); + } + @Test public void shouldHaveSensibleDefaultValues() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); @@ -120,7 +146,8 @@ public void 
shouldDisableContentHashByQueryParameter() { UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); SolrQueryRequest updateRequest = createUpdateRequest(false); // Request disables processor - UpdateRequestProcessor instance = factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); + UpdateRequestProcessor instance = + factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); assertEquals(instance, next); } @@ -131,7 +158,8 @@ public void shouldEnableContentHashByQueryParameter() { UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); SolrQueryRequest updateRequest = createUpdateRequest(true); // Request enables processor - UpdateRequestProcessor instance = factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); + UpdateRequestProcessor instance = + factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); assertNotEquals(instance, next); } @@ -139,7 +167,8 @@ public void shouldEnableContentHashByQueryParameter() { private static SolrQueryRequest createUpdateRequest(boolean enableContentHashParamValue) { SolrQueryRequest req = mock(SolrQueryRequest.class); SolrParams solrParams = mock(SolrParams.class); - when(solrParams.getBool(eq(CONTENT_HASH_ENABLED_PARAM), anyBoolean())).thenReturn(enableContentHashParamValue); + when(solrParams.getBool(eq(CONTENT_HASH_ENABLED_PARAM), anyBoolean())) + .thenReturn(enableContentHashParamValue); when(req.getParams()).thenReturn(solrParams); return req; diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java index 1f399816ce7..92ea8282f9e 100644 --- a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java @@ -1,5 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under 
one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.solr.update.processor; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.UUID; import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.SolrParams; @@ -10,17 +38,9 @@ import org.apache.solr.schema.UUIDField; import org.apache.solr.update.AddUpdateCommand; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.UUID; - -import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.Mockito.*; - public class ContentHashVersionProcessorTest extends UpdateProcessorTestBase { public static final String ID_FIELD = "_id"; @@ -37,31 +57,37 @@ private ContentHashVersionProcessor getContentHashVersionProcessor( UpdateRequestProcessor 
next, List includedFields, List excludedFields) { - ContentHashVersionProcessor processor = new ContentHashVersionProcessor( - ContentHashVersionProcessorFactory.buildFieldMatcher(includedFields), - ContentHashVersionProcessorFactory.buildFieldMatcher(excludedFields), - HASH_FIELD_NAME, - req, - rsp, - next); + ContentHashVersionProcessor processor = + new ContentHashVersionProcessor( + ContentHashVersionProcessorFactory.buildFieldMatcher(includedFields), + ContentHashVersionProcessorFactory.buildFieldMatcher(excludedFields), + HASH_FIELD_NAME, + req, + rsp, + next); // Given (previous doc retrieval configuration) - processor.setOldDocProvider((core, hashField, indexedDocId) -> { - final SolrInputDocument inputDocument = doc( - f(ID_FIELD, indexedDocId.utf8ToString()), - f(FIRST_FIELD, "Initial values used to compute initial hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); - return doc( - f(ID_FIELD, inputDocument.getFieldValue(ID_FIELD)), - f(FIRST_FIELD, inputDocument.getFieldValue(FIRST_FIELD)), - f(SECOND_FIELD, inputDocument.getFieldValue(SECOND_FIELD)), - f(hashField, processor.computeDocHash(inputDocument)) - ); - }); + processor.setOldDocProvider( + (core, hashField, indexedDocId) -> { + final SolrInputDocument inputDocument = + doc( + f(ID_FIELD, indexedDocId.utf8ToString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); + return doc( + f(ID_FIELD, inputDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, inputDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, inputDocument.getFieldValue(SECOND_FIELD)), + f(hashField, processor.computeDocHash(inputDocument))); + }); return processor; } + @BeforeClass + public static void beforeClass() throws Exception { + assumeWorkingMockito(); + } + @Before public void setUp() throws Exception { super.setUp(); @@ -74,36 +100,37 @@ public void setUp() throws Exception { 
IndexSchema indexSchema = mock(IndexSchema.class); when(req.getSchema()).thenReturn(indexSchema); when(indexSchema.getUniqueKeyField()).thenReturn(new SchemaField(ID_FIELD, new UUIDField())); - when(indexSchema.indexableUniqueKey(anyString())).then(invocationOnMock -> new BytesRef(invocationOnMock.getArgument( - 0).toString())); + when(indexSchema.indexableUniqueKey(anyString())) + .then(invocationOnMock -> new BytesRef(invocationOnMock.getArgument(0).toString())); } @Test public void shouldComputeHashForDoc() { // Given - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - mock(UpdateRequestProcessor.class), - List.of("*"), - List.of(ID_FIELD)); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("*"), + List.of(ID_FIELD)); // Given (doc for update) - SolrInputDocument inputDocument1 = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); + SolrInputDocument inputDocument1 = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); // Then assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument1)); // Given (doc for update - with different order) - SolrInputDocument inputDocument2 = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields"), - f(FIRST_FIELD, "Values will serve as input to compute a hash") - ); + SolrInputDocument inputDocument2 = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields"), + f(FIRST_FIELD, "Values will serve as input to compute a 
hash")); // Then (hash remain same, since id is excluded from signature fields) assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument2)); @@ -112,42 +139,47 @@ public void shouldComputeHashForDoc() { @Test public void shouldUseExcludedFieldsWildcard() { // Given - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of("*"), - List.of("field*")); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("*"), + List.of("field*")); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, "0000000001"), - f(FIRST_FIELD, UUID.randomUUID().toString()), - f(SECOND_FIELD, UUID.randomUUID().toString()), - f(THIRD_FIELD, "constant to have a constant hash"), - f(FOURTH_FIELD, UUID.randomUUID().toString()) - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, UUID.randomUUID().toString()), + f(SECOND_FIELD, UUID.randomUUID().toString()), + f(THIRD_FIELD, "constant to have a constant hash"), + f(FOURTH_FIELD, UUID.randomUUID().toString())); // Then (only ID and THIRD_FIELD is used in hash, other fields contain random values) - assertEquals("bwE8Zjq0aOs=", processor.computeDocHash(inputDocument)); // Hash if only ID field was used + assertEquals( + "bwE8Zjq0aOs=", processor.computeDocHash(inputDocument)); // Hash if only ID field was used } @Test public void shouldUseIncludedFieldsWildcard() { // Given - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of("field*"), - List.of(THIRD_FIELD)); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("field*"), + List.of(THIRD_FIELD)); // 
Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, "0000000001"), - f(FIRST_FIELD, "constant to have a constant hash for field1"), - f(SECOND_FIELD, "constant to have a constant hash for field2"), - f(THIRD_FIELD, UUID.randomUUID().toString()), - f(FOURTH_FIELD, "constant to have a constant hash for field4") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash for field4")); // Then assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); @@ -156,20 +188,22 @@ public void shouldUseIncludedFieldsWildcard() { @Test public void shouldUseIncludedFieldsWildcard2() { // Given (variant of previous shouldUseIncludedFieldsWildcard, without the excludedField config) - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of("field*"), - Collections.emptyList()); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of("field*"), + List.of()); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, "0000000001"), - f(FIRST_FIELD, "constant to have a constant hash for field1"), - f(SECOND_FIELD, "constant to have a constant hash for field2"), - f(THIRD_FIELD, UUID.randomUUID().toString()), - f(FOURTH_FIELD, "constant to have a constant hash for field4") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a constant hash 
for field4")); // Then assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); @@ -178,26 +212,33 @@ public void shouldUseIncludedFieldsWildcard2() { @Test public void shouldDedupIncludedFields() { // Given (processor to include field1 and field2 only) - ContentHashVersionProcessor processorWithDuplicatedFieldName = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of(FIRST_FIELD, FIRST_FIELD, SECOND_FIELD), - Collections.emptyList()); - - ContentHashVersionProcessor processorWithWildcard = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), mock(UpdateRequestProcessor.class), - List.of(SECOND_FIELD, FIRST_FIELD, "field1*"), // Also change order of config (test reorder of field names) - Collections.emptyList()); + ContentHashVersionProcessor processorWithDuplicatedFieldName = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of(FIRST_FIELD, FIRST_FIELD, SECOND_FIELD), + List.of()); + + ContentHashVersionProcessor processorWithWildcard = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class), + List.of( + SECOND_FIELD, + FIRST_FIELD, + "field1*"), // Also change order of config (test reorder of field names) + List.of()); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, "0000000001"), - f(FIRST_FIELD, "constant to have a constant hash for field1"), - f(SECOND_FIELD, "constant to have a constant hash for field2"), - f(THIRD_FIELD, UUID.randomUUID().toString()), - f(FOURTH_FIELD, "constant to have a constant hash for field4") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, "0000000001"), + f(FIRST_FIELD, "constant to have a constant hash for field1"), + f(SECOND_FIELD, "constant to have a constant hash for field2"), + f(THIRD_FIELD, UUID.randomUUID().toString()), + f(FOURTH_FIELD, "constant to have a 
constant hash for field4")); // Then assertEquals("XavrOYGlkXM=", processorWithDuplicatedFieldName.computeDocHash(inputDocument)); @@ -208,12 +249,13 @@ public void shouldDedupIncludedFields() { public void shouldCreateSignatureForNewDoc() throws IOException { // Given SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - Arrays.asList(FIRST_FIELD, SECOND_FIELD), - Collections.emptyList()); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + Arrays.asList(FIRST_FIELD, SECOND_FIELD), + List.of()); processor.setDiscardSameDocuments(false); processor.setOldDocProvider((core, hashField, indexedDocId) -> null); @@ -221,11 +263,11 @@ public void shouldCreateSignatureForNewDoc() throws IOException { AddUpdateCommand cmd = new AddUpdateCommand(req); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); cmd.solrDoc = inputDocument; // When @@ -240,53 +282,57 @@ public void shouldCreateSignatureForNewDoc() throws IOException { // Then (asserts on hash comparison results) verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(0)); - verify(response, times(1)).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); // And no hash clash with old doc + verify(response, times(1)) + .addToLog(eq("numAddsExistingWithIdentical"), eq(0)); // And no hash clash with old doc } @Test public void shouldAddToResponseLog() throws 
IOException { // Given SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - Arrays.asList(FIRST_FIELD, SECOND_FIELD), - Collections.emptyList()); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + Arrays.asList(FIRST_FIELD, SECOND_FIELD), + List.of()); processor.setDiscardSameDocuments(false); // Given (command to update existing doc) AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) - SolrInputDocument initialDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Initial values used to compute initial hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); - cmdDoesNotChangeValues.solrDoc = doc( - f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), - f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), - f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), - f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument)) - ); + SolrInputDocument initialDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); + cmdDoesNotChangeValues.solrDoc = + doc( + f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), + f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument))); // Given (command to update existing doc with different content) AddUpdateCommand cmdChangesDocValues = new AddUpdateCommand(req); - // Given (doc for update - does *not* match the existing doc, see 
getContentHashVersionProcessor()) - cmdChangesDocValues.solrDoc = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "This is a doc with values"), - f(SECOND_FIELD, "that differs from stored doc, so it's considered new") - ); + // Given (doc for update - does *not* match the existing doc, see + // getContentHashVersionProcessor()) + cmdChangesDocValues.solrDoc = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "This is a doc with values"), + f(SECOND_FIELD, "that differs from stored doc, so it's considered new")); // When processor.processAdd(cmdDoesNotChangeValues); processor.processAdd(cmdChangesDocValues); processor.finish(); - // Then (read as follows: 2 updates occurred for an existing doc. Among these updates, 1 update tried to replace + // Then (read as follows: 2 updates occurred for an existing doc. Among these updates, 1 update + // tried to replace // doc with the same content) verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(2)); verify(response, times(1)).addToLog(eq("numAddsExistingWithIdentical"), eq(1)); @@ -296,22 +342,19 @@ public void shouldAddToResponseLog() throws IOException { public void shouldNotUpdateSignatureForNewDoc() throws IOException { // Given SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - List.of(SECOND_FIELD), - Collections.emptyList()); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, response, mock(UpdateRequestProcessor.class), List.of(SECOND_FIELD), List.of()); // Given (command) AddUpdateCommand cmd = new AddUpdateCommand(req); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); + 
SolrInputDocument inputDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); cmd.solrDoc = inputDocument; // When @@ -330,23 +373,24 @@ public void shouldNotUpdateSignatureForNewDoc() throws IOException { public void shouldExcludeFieldsUpdateSignatureForNewDoc() throws IOException { // Given SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - List.of(FIRST_FIELD, SECOND_FIELD), - List.of(FIRST_FIELD)); + ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + response, + mock(UpdateRequestProcessor.class), + List.of(FIRST_FIELD, SECOND_FIELD), + List.of(FIRST_FIELD)); processor.setDiscardSameDocuments(false); // Given (command) AddUpdateCommand cmd = new AddUpdateCommand(req); // Given (doc for update) - SolrInputDocument inputDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); + SolrInputDocument inputDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Values will serve as input to compute a hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); cmd.solrDoc = inputDocument; // When @@ -365,29 +409,30 @@ public void shouldExcludeFieldsUpdateSignatureForNewDoc() throws IOException { public void shouldCommitWithDiscardModeEnabled() throws IOException { // Given UpdateRequestProcessor nextProcessor = mock(UpdateRequestProcessor.class); - ContentHashVersionProcessor processor = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - nextProcessor, - List.of(FIRST_FIELD, SECOND_FIELD), - List.of(FIRST_FIELD)); + 
ContentHashVersionProcessor processor = + getContentHashVersionProcessor( + req, + mock(SolrQueryResponse.class), + nextProcessor, + List.of(FIRST_FIELD, SECOND_FIELD), + List.of(FIRST_FIELD)); processor.setDiscardSameDocuments(true); // Given (command to update existing doc) AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) - SolrInputDocument initialDocument = doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Initial values used to compute initial hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields") - ); - cmdDoesNotChangeValues.solrDoc = doc( - f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), - f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), - f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), - f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument)) - ); + SolrInputDocument initialDocument = + doc( + f(ID_FIELD, UUID.randomUUID().toString()), + f(FIRST_FIELD, "Initial values used to compute initial hash"), + f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); + cmdDoesNotChangeValues.solrDoc = + doc( + f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), + f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), + f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), + f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument))); // When processor.processAdd(cmdDoesNotChangeValues); @@ -395,5 +440,4 @@ public void shouldCommitWithDiscardModeEnabled() throws IOException { // Then verify(nextProcessor, never()).processAdd(eq(cmdDoesNotChangeValues)); } - } From cd3b510a97f2c11acfbd6566600eed5a2ee4d835 Mon Sep 17 00:00:00 2001 From: fhuaulme Date: Fri, 3 Apr 2026 18:52:45 +0200 Subject: [PATCH 3/6] * java lint fixes --- .../solr/update/processor/ContentHashVersionProcessor.java | 1 + 1 file changed, 1 insertion(+) diff --git 
a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java index f4b40221a1b..f728e0905d4 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -87,6 +87,7 @@ public ContentHashVersionProcessor( this.excludedFields = hashExcludedFields; } + @Override public void processAdd(AddUpdateCommand cmd) throws IOException { SolrInputDocument newDoc = cmd.getSolrInputDocument(); String newHash = computeDocHash(newDoc); From f4fe9f2cda65bb3c351b8eb47e5290798c1c7423 Mon Sep 17 00:00:00 2001 From: fhuaulme Date: Thu, 9 Apr 2026 14:06:37 +0200 Subject: [PATCH 4/6] (review) =URP= * Remove OldDocProvider internal interface * Rework responses params (to include URP name) * Rework hash computation to prevent successive loops over fields * Hash field needs to be explicitly specified in config * Removes per request enablement of URP =Javadoc/naming= * Take into account doc values/field values in javadoc * Prefer "drop" over "discard" in doc/variables/method names * Removes remaining Collections.singletonList() =Unit test= * Adds utility method in SolrTestCaseJ4 to get update response * Rework unit test to use actual URP chain, remove mocks * Improve test coverage for URP factory parameters (excluded fields) * Improve test coverage for URP (drop, log modes, multi-valued fields) --- .../ContentHashVersionProcessor.java | 169 +++--- .../ContentHashVersionProcessorFactory.java | 44 +- .../solr/collection1/conf/schema16.xml | 33 ++ .../conf/solrconfig-contenthashversion.xml | 66 +++ ...ontentHashVersionProcessorFactoryTest.java | 82 +-- .../ContentHashVersionProcessorTest.java | 542 +++++++++--------- .../java/org/apache/solr/SolrTestCaseJ4.java | 10 +- 7 files changed, 491 insertions(+), 455 deletions(-) create mode 100644 
solr/core/src/test-files/solr/collection1/conf/schema16.xml create mode 100644 solr/core/src/test-files/solr/collection1/conf/solrconfig-contenthashversion.xml diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java index f728e0905d4..c17dada5eb1 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -21,8 +21,8 @@ import java.lang.invoke.MethodHandles; import java.util.Base64; import java.util.Collection; -import java.util.List; import java.util.Objects; +import java.util.Optional; import java.util.Set; import java.util.function.Predicate; import org.apache.lucene.util.BytesRef; @@ -41,21 +41,22 @@ import org.slf4j.LoggerFactory; /** - * An implementation of {@link UpdateRequestProcessor} which computes a hash of selected doc values, - * and uses this hash value to reject/accept doc updates. + * An implementation of {@link UpdateRequestProcessor} which computes a hash of field values, and + * uses this hash to reject/accept document updates. * *

    - *
  • When no corresponding doc with same id exists (create), computed hash is added to the - * document. - *
  • When a previous doc exists (update), a new hash is computed using new version values and - * compared with old hash. + *
  • When no corresponding document with same id exists (create), the computed hash is added to + * the document. + *
  • When a previous document exists (update), a new hash is computed from the incoming field + * values and compared with the stored hash. *
* - * Depending on {#discardSameDocuments} value, this processor may reject or accept doc update. This - * implementation can be used for monitoring or rejecting no-op updates (updates that do not change - * Solr document). + *

Depending on {#dropSameDocuments} value, this processor may drop or accept document updates. + * This implementation can be used for monitoring or dropping no-op updates (updates that do not + * change the Solr document content). * - *

Note: hash is computed using {@link Lookup3Signature}. + *

Note: the hash is computed using {@link Lookup3Signature} and must be stored in a field with + * docValues enabled for retrieval. * * @see Lookup3Signature */ @@ -66,11 +67,9 @@ public class ContentHashVersionProcessor extends UpdateRequestProcessor { private final SolrCore core; private final Predicate includedFields; // Matcher for included fields in hash private final Predicate excludedFields; // Matcher for excluded fields from hash - private OldDocProvider oldDocProvider = new DefaultDocProvider(); - private boolean discardSameDocuments; + private boolean dropSameDocuments; private int sameCount = 0; private int differentCount = 0; - private int unknownCount = 0; public ContentHashVersionProcessor( Predicate hashIncludedFields, @@ -94,7 +93,7 @@ public void processAdd(AddUpdateCommand cmd) throws IOException { newDoc.setField(hashField.getName(), newHash); int i = 0; - if (!validateHash(cmd.getIndexedId(), newHash)) { + if (!isHashAcceptable(cmd.getIndexedId(), newHash)) { return; } @@ -117,9 +116,22 @@ public void finish() throws IOException { try { super.finish(); } finally { - rsp.addToLog("numAddsExisting", sameCount + differentCount + unknownCount); - rsp.addToLog("numAddsExistingWithIdentical", sameCount); - rsp.addToLog("numAddsExistingUnknown", unknownCount); + // Only log when there are updates to existing documents + int totalUpdates = sameCount + differentCount; + if (totalUpdates > 0) { + if (dropSameDocuments) { + rsp.addToLog("contentHash.duplicatesDropped", sameCount); + rsp.addToLog("contentHash.duplicatesDetected", sameCount); + } else { + rsp.addToLog("contentHash.duplicatesDropped", 0); + rsp.addToLog("contentHash.duplicatesDetected", sameCount); + } + rsp.addToLog("contentHash.changed", differentCount); + } else { + rsp.addToLog("contentHash.duplicatesDropped", 0); + rsp.addToLog("contentHash.duplicatesDetected", 0); + rsp.addToLog("contentHash.changed", 0); + } } } @@ -129,27 +141,19 @@ private static void logOverlyFailedRetries(int i, 
UpdateCommand cmd) { } } - void setOldDocProvider(OldDocProvider oldDocProvider) { - this.oldDocProvider = oldDocProvider; + void setDropSameDocuments(boolean dropSameDocuments) { + this.dropSameDocuments = dropSameDocuments; } - void setDiscardSameDocuments(boolean discardSameDocuments) { - this.discardSameDocuments = discardSameDocuments; - } - - private boolean validateHash(BytesRef indexedDocId, String newHash) throws IOException { + private boolean isHashAcceptable(BytesRef indexedDocId, String newHash) throws IOException { assert null != indexedDocId; - var docFoundAndOldUserVersions = getOldUserVersionsFromStored(indexedDocId); - if (docFoundAndOldUserVersions.found) { - String oldHash = - docFoundAndOldUserVersions.oldHash; // No hash: might want to keep track of these too - if (oldHash == null) { - unknownCount++; - return true; - } else if (Objects.equals(newHash, oldHash)) { + Optional oldDocHash = getOldDocHash(indexedDocId); + if (oldDocHash.isPresent()) { + String oldHash = oldDocHash.get(); // No hash: might want to keep track of these too + if (Objects.equals(newHash, oldHash)) { sameCount++; - return !discardSameDocuments; + return !dropSameDocuments; } else { differentCount++; return true; @@ -158,80 +162,41 @@ private boolean validateHash(BytesRef indexedDocId, String newHash) throws IOExc return true; // Doc not found } - private DocFoundAndOldUserAndSolrVersions getOldUserVersionsFromStored(BytesRef indexedDocId) - throws IOException { - SolrInputDocument oldDoc = oldDocProvider.getDocument(core, hashField.getName(), indexedDocId); - return null == oldDoc - ? 
DocFoundAndOldUserAndSolrVersions.NOT_FOUND - : getUserVersionAndSolrVersionFromDocument(oldDoc); - } - - private DocFoundAndOldUserAndSolrVersions getUserVersionAndSolrVersionFromDocument( - SolrInputDocument oldDoc) { - Object o = oldDoc.getFieldValue(hashField.getName()); - if (o != null) { - return new DocFoundAndOldUserAndSolrVersions(o.toString()); + /** Retrieves the hash value from the old document identified by the given ID. */ + private Optional getOldDocHash(BytesRef indexedDocId) throws IOException { + SolrInputDocument oldDoc = + RealTimeGetComponent.getInputDocument( + core, indexedDocId, indexedDocId, null, Set.of(hashField.getName()), Resolution.DOC); + if (oldDoc == null) { + return Optional.empty(); } - return new DocFoundAndOldUserAndSolrVersions(); + Object o = oldDoc.getFieldValue(hashField.getName()); + return Optional.ofNullable(o).map(String::valueOf); } - public String computeDocHash(SolrInputDocument doc) { - List docIncludedFieldNames = - doc.getFieldNames().stream() - .filter(includedFields) // Keep fields that match 'included fields' matcher... 
- .filter( - excludedFields - .negate()) // ...and exclude fields that match 'excluded fields' matcher - .sorted() // Sort to ensure consistent field order across different doc field orders - .toList(); - + String computeDocHash(SolrInputDocument doc) { final Signature sig = new Lookup3Signature(); - for (String fieldName : docIncludedFieldNames) { - sig.add(fieldName); - Object o = doc.getFieldValue(fieldName); - if (o instanceof Collection) { - for (Object oo : (Collection) o) { - sig.add(String.valueOf(oo)); - } - } else { - sig.add(String.valueOf(o)); - } - } + + // Stream field names, filter, sort, and process in a single pass + doc.getFieldNames().stream() + .filter(includedFields) // Keep fields that match 'included fields' matcher + .filter(excludedFields.negate()) // Exclude fields that match 'excluded fields' matcher + .sorted() // Sort to ensure consistent field order across different doc field orders + .forEach( + fieldName -> { + sig.add(fieldName); + Object o = doc.getFieldValue(fieldName); + if (o instanceof Collection) { + for (Object oo : (Collection) o) { + sig.add(String.valueOf(oo)); + } + } else { + sig.add(String.valueOf(o)); + } + }); // Signature, depending on implementation, may return 8-byte or 16-byte value byte[] signature = sig.getSignature(); - return Base64.getEncoder() - .encodeToString(signature); // Makes a base64 hash out of signature value - } - - interface OldDocProvider { - SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) - throws IOException; - } - - private static class DefaultDocProvider implements OldDocProvider { - @Override - public SolrInputDocument getDocument(SolrCore core, String hashField, BytesRef indexedDocId) - throws IOException { - return RealTimeGetComponent.getInputDocument( - core, indexedDocId, indexedDocId, null, Set.of(hashField), Resolution.PARTIAL); - } - } - - private static class DocFoundAndOldUserAndSolrVersions { - private static final 
DocFoundAndOldUserAndSolrVersions NOT_FOUND = - new DocFoundAndOldUserAndSolrVersions(); - private final boolean found; - - public String oldHash; - - private DocFoundAndOldUserAndSolrVersions() { - this.found = false; - } - - private DocFoundAndOldUserAndSolrVersions(String oldHash) { - this.found = true; - this.oldHash = oldHash; - } + return Base64.getEncoder().encodeToString(signature); } } diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java index 48fa52db928..bdff30f062d 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessorFactory.java @@ -18,7 +18,6 @@ package org.apache.solr.update.processor; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import java.util.Locale; import java.util.function.Predicate; @@ -35,14 +34,10 @@ public class ContentHashVersionProcessorFactory extends UpdateRequestProcessorFactory implements SolrCoreAware, UpdateRequestProcessorFactory.RunAlways { private static final char SEPARATOR = ','; // Separator for included/excluded fields - static final String CONTENT_HASH_ENABLED_PARAM = "contentHashEnabled"; - private List includeFields = - Collections.singletonList("*"); // Included fields defaults to 'all' + private List includeFields = List.of("*"); // Included fields defaults to 'all' private List excludeFields = new ArrayList<>(); - // No excluded field by default, yet hashFieldName is excluded by default - private String hashFieldName = - "content_hash"; // Field name to store computed hash on create/update operations - private boolean discardSameDocuments = true; + private String hashFieldName; // Must be explicitly configured + private boolean dropSameDocuments = true; public ContentHashVersionProcessorFactory() {} @@ 
-61,13 +56,16 @@ public void init(NamedList args) { .collect(Collectors.toList()); } tmp = args.remove("hashFieldName"); - if (tmp != null) { - if (!(tmp instanceof String)) { - throw new SolrException( - SolrException.ErrorCode.SERVER_ERROR, "'hashFieldName' must be configured as a "); - } - this.hashFieldName = (String) tmp; + if (tmp == null) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "'hashFieldName' is required and must be explicitly configured"); } + if (!(tmp instanceof String)) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, "'hashFieldName' must be configured as a "); + } + this.hashFieldName = (String) tmp; tmp = args.remove("excludeFields"); if (tmp != null) { @@ -97,16 +95,16 @@ public void init(NamedList args) { "'hashCompareStrategy' must be configured as a "); } String value = ((String) tmp).toLowerCase(Locale.ROOT); - if ("discard".equalsIgnoreCase(value)) { - discardSameDocuments = true; + if ("drop".equalsIgnoreCase(value)) { + dropSameDocuments = true; } else if ("log".equalsIgnoreCase(value)) { - discardSameDocuments = false; + dropSameDocuments = false; } else { throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Value '" + value - + "' is unsupported for 'hashCompareStrategy', only 'discard' and 'log' are supported."); + + "' is unsupported for 'hashCompareStrategy', only 'drop' and 'log' are supported."); } } @@ -115,10 +113,6 @@ public void init(NamedList args) { public UpdateRequestProcessor getInstance( SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { - if (!req.getParams().getBool(CONTENT_HASH_ENABLED_PARAM, false)) { - return next; - } - ContentHashVersionProcessor processor = new ContentHashVersionProcessor( buildFieldMatcher(includeFields), @@ -127,7 +121,7 @@ public UpdateRequestProcessor getInstance( req, rsp, next); - processor.setDiscardSameDocuments(discardSameDocuments); + processor.setDropSameDocuments(dropSameDocuments); return processor; } 
@@ -150,8 +144,8 @@ public List getExcludeFields() { return excludeFields; } - public boolean discardSameDocuments() { - return discardSameDocuments; + public boolean dropSameDocuments() { + return dropSameDocuments; } static Predicate buildFieldMatcher(List fieldNames) { diff --git a/solr/core/src/test-files/solr/collection1/conf/schema16.xml b/solr/core/src/test-files/solr/collection1/conf/schema16.xml new file mode 100644 index 00000000000..3cfdcc119fe --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/schema16.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + _id + diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-contenthashversion.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-contenthashversion.xml new file mode 100644 index 00000000000..5afe4b167cf --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-contenthashversion.xml @@ -0,0 +1,66 @@ + + + + + + + ${tests.luceneMatchVersion:LATEST} + + + + + ${solr.data.dir:} + + + + + + ${solr.ulog.dir:} + + + + + + + _hash_ + _id + + + + + + + _hash_ + _id + log + + + + + + + _hash_ + _id + drop + + + + + + + diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java index 2f2e920ac6d..807a1987264 100644 --- a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorFactoryTest.java @@ -17,22 +17,13 @@ package org.apache.solr.update.processor; import static org.apache.solr.SolrTestCaseJ4.assumeWorkingMockito; -import static org.apache.solr.update.processor.ContentHashVersionProcessorFactory.CONTENT_HASH_ENABLED_PARAM; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotEquals; import static 
org.junit.Assert.assertTrue; -import static org.mockito.ArgumentMatchers.anyBoolean; -import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; import java.util.List; import org.apache.solr.common.SolrException; -import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; -import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.response.SolrQueryResponse; import org.junit.BeforeClass; import org.junit.Test; @@ -47,8 +38,7 @@ public static void beforeClass() throws Exception { public void shouldHaveSensibleDefaultValues() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); assertEquals(List.of("*"), factory.getIncludeFields()); - assertEquals("content_hash", factory.getHashFieldName()); - assertTrue(factory.discardSameDocuments()); + assertTrue(factory.dropSameDocuments()); } @Test @@ -65,6 +55,7 @@ public void shouldInitWithHashFieldName() { public void shouldInitWithAllField() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); args.add("includeFields", "*"); factory.init(args); @@ -76,6 +67,7 @@ public void shouldInitWithAllField() { public void shouldInitWithIncludedFields() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); args.add("includeFields", " field1,field2 , field3 "); factory.init(args); @@ -87,6 +79,7 @@ public void shouldInitWithIncludedFields() { public void shouldInitWithExcludedFields() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); args.add("excludeFields", " field1,field2 , field3 "); factory.init(args); @@ -95,39 +88,32 @@ public 
void shouldInitWithExcludedFields() { } @Test - public void shouldSelectRejectSameHashStrategy() { + public void shouldSelectDropStrategy() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); NamedList args = new NamedList<>(); - args.add("hashCompareStrategy", "discard"); + args.add("hashFieldName", "content_hash"); + args.add("hashCompareStrategy", "drop"); factory.init(args); - assertTrue(factory.discardSameDocuments()); + assertTrue(factory.dropSameDocuments()); } @Test public void shouldSelectLogStrategy() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); args.add("hashCompareStrategy", "log"); factory.init(args); - assertFalse(factory.discardSameDocuments()); - } - - @Test - public void shouldSelectDiscardStrategy() { - ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); - NamedList args = new NamedList<>(); - args.add("hashCompareStrategy", "discard"); - factory.init(args); - - assertTrue(factory.discardSameDocuments()); + assertFalse(factory.dropSameDocuments()); } @Test(expected = SolrException.class) public void shouldSelectUnsupportedStrategy() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); args.add("hashCompareStrategy", "unsupported value"); factory.init(args); } @@ -136,41 +122,35 @@ public void shouldSelectUnsupportedStrategy() { public void shouldRejectExcludeAllFields() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); NamedList args = new NamedList<>(); + args.add("hashFieldName", "content_hash"); args.add("excludeFields", "*"); factory.init(args); } - @Test - public void shouldDisableContentHashByQueryParameter() { + @Test(expected = SolrException.class) + public void 
shouldRequireExplicitHashFieldName() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); - UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); - SolrQueryRequest updateRequest = createUpdateRequest(false); // Request disables processor - - UpdateRequestProcessor instance = - factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); - - assertEquals(instance, next); + NamedList args = new NamedList<>(); + // Intentionally not setting hashFieldName + factory.init(args); } @Test - public void shouldEnableContentHashByQueryParameter() { + public void shouldAutoExcludeHashFieldFromHashComputation() { ContentHashVersionProcessorFactory factory = new ContentHashVersionProcessorFactory(); - UpdateRequestProcessor next = mock(UpdateRequestProcessor.class); - SolrQueryRequest updateRequest = createUpdateRequest(true); // Request enables processor - - UpdateRequestProcessor instance = - factory.getInstance(updateRequest, mock(SolrQueryResponse.class), next); - - assertNotEquals(instance, next); - } - - private static SolrQueryRequest createUpdateRequest(boolean enableContentHashParamValue) { - SolrQueryRequest req = mock(SolrQueryRequest.class); - SolrParams solrParams = mock(SolrParams.class); - when(solrParams.getBool(eq(CONTENT_HASH_ENABLED_PARAM), anyBoolean())) - .thenReturn(enableContentHashParamValue); - when(req.getParams()).thenReturn(solrParams); + NamedList args = new NamedList<>(); + args.add("hashFieldName", "my_hash_field"); + args.add("excludeFields", "field1,field2"); + factory.init(args); - return req; + // Hash field should be automatically added to excludeFields + assertEquals(3, factory.getExcludeFields().size()); + assertTrue( + "Should contain explicitly excluded field1", factory.getExcludeFields().contains("field1")); + assertTrue( + "Should contain explicitly excluded field2", factory.getExcludeFields().contains("field2")); + assertTrue( + "Should auto-exclude hash field name", + 
factory.getExcludeFields().contains("my_hash_field")); } } diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java index 92ea8282f9e..6b9c908cbfe 100644 --- a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java +++ b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java @@ -16,27 +16,14 @@ */ package org.apache.solr.update.processor; -import static org.mockito.ArgumentMatchers.anyString; -import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; -import java.io.IOException; -import java.util.Arrays; import java.util.List; import java.util.UUID; -import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.common.params.SolrParams; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; -import org.apache.solr.schema.IndexSchema; -import org.apache.solr.schema.SchemaField; -import org.apache.solr.schema.UUIDField; -import org.apache.solr.update.AddUpdateCommand; +import org.jspecify.annotations.NonNull; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -44,108 +31,73 @@ public class ContentHashVersionProcessorTest extends UpdateProcessorTestBase { public static final String ID_FIELD = "_id"; - public static final String HASH_FIELD_NAME = "_hash_"; public static final String FIRST_FIELD = "field1"; public static final String SECOND_FIELD = "field2"; public static final String THIRD_FIELD = "docField3"; public static final String FOURTH_FIELD = "field4"; - private SolrQueryRequest req; - private ContentHashVersionProcessor 
getContentHashVersionProcessor( - SolrQueryRequest req, - SolrQueryResponse rsp, - UpdateRequestProcessor next, - List includedFields, - List excludedFields) { - ContentHashVersionProcessor processor = - new ContentHashVersionProcessor( - ContentHashVersionProcessorFactory.buildFieldMatcher(includedFields), - ContentHashVersionProcessorFactory.buildFieldMatcher(excludedFields), - HASH_FIELD_NAME, - req, - rsp, - next); - - // Given (previous doc retrieval configuration) - processor.setOldDocProvider( - (core, hashField, indexedDocId) -> { - final SolrInputDocument inputDocument = - doc( - f(ID_FIELD, indexedDocId.utf8ToString()), - f(FIRST_FIELD, "Initial values used to compute initial hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); - return doc( - f(ID_FIELD, inputDocument.getFieldValue(ID_FIELD)), - f(FIRST_FIELD, inputDocument.getFieldValue(FIRST_FIELD)), - f(SECOND_FIELD, inputDocument.getFieldValue(SECOND_FIELD)), - f(hashField, processor.computeDocHash(inputDocument))); - }); - return processor; - } + public static final String INITIAL_DOC_ID = "1"; + public static final String INITIAL_FIELD1_VALUE = "Initial values used to compute initial hash"; + public static final String INITIAL_FIELD2_VALUE = + "This a constant value for testing include/exclude fields"; + public static final String[] INITIAL_DOC = + new String[] { + ID_FIELD, INITIAL_DOC_ID, + FIRST_FIELD, INITIAL_FIELD1_VALUE, + SECOND_FIELD, INITIAL_FIELD2_VALUE + }; + private String initialDocHash; @BeforeClass public static void beforeClass() throws Exception { - assumeWorkingMockito(); + initCore("solrconfig-contenthashversion.xml", "schema16.xml"); } @Before public void setUp() throws Exception { super.setUp(); + assertU(delQ("*:*")); + addDoc(adoc(INITIAL_DOC), "contenthashversion-default"); + assertU(commit()); - // Given (processor configuration) - req = mock(SolrQueryRequest.class); - when(req.getParams()).thenReturn(mock(SolrParams.class)); - - // Given 
(schema configuration) - IndexSchema indexSchema = mock(IndexSchema.class); - when(req.getSchema()).thenReturn(indexSchema); - when(indexSchema.getUniqueKeyField()).thenReturn(new SchemaField(ID_FIELD, new UUIDField())); - when(indexSchema.indexableUniqueKey(anyString())) - .then(invocationOnMock -> new BytesRef(invocationOnMock.getArgument(0).toString())); + // Query for the document and extract _hash_ field value + initialDocHash = getHashFieldValue(INITIAL_DOC_ID); } - @Test - public void shouldComputeHashForDoc() { - // Given - ContentHashVersionProcessor processor = - getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - mock(UpdateRequestProcessor.class), - List.of("*"), - List.of(ID_FIELD)); - - // Given (doc for update) - SolrInputDocument inputDocument1 = - doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); - - // Then - assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument1)); - - // Given (doc for update - with different order) - SolrInputDocument inputDocument2 = - doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields"), - f(FIRST_FIELD, "Values will serve as input to compute a hash")); + private static @NonNull String getHashFieldValue(String docId) throws Exception { + String response = h.query(req("q", ID_FIELD + ":" + docId, "fl", "_hash_")); + + // Parse XML response to extract _hash_ field value + // Response format: value + String hashPattern = ""; + int startIdx = response.indexOf(hashPattern); + if (startIdx == -1) { + fail("Hash field not found in document " + docId); + } + startIdx += hashPattern.length(); + int endIdx = response.indexOf("", startIdx); + if (endIdx == -1) { + fail("Hash field closing tag not found"); + } + return response.substring(startIdx, endIdx); + } - // Then (hash remain 
same, since id is excluded from signature fields) - assertEquals("Tak0G5a/DIE=", processor.computeDocHash(inputDocument2)); + private ContentHashVersionProcessor getContentHashVersionProcessor( + List includedFields, List excludedFields) { + return new ContentHashVersionProcessor( + ContentHashVersionProcessorFactory.buildFieldMatcher(includedFields), + ContentHashVersionProcessorFactory.buildFieldMatcher(excludedFields), + "_hash_", + mock(SolrQueryRequest.class), + mock(SolrQueryResponse.class), + mock(UpdateRequestProcessor.class)); } @Test public void shouldUseExcludedFieldsWildcard() { // Given ContentHashVersionProcessor processor = - getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - mock(UpdateRequestProcessor.class), - List.of("*"), - List.of("field*")); + getContentHashVersionProcessor(List.of("*"), List.of("field*")); // Given (doc for update) SolrInputDocument inputDocument = @@ -165,12 +117,7 @@ public void shouldUseExcludedFieldsWildcard() { public void shouldUseIncludedFieldsWildcard() { // Given ContentHashVersionProcessor processor = - getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - mock(UpdateRequestProcessor.class), - List.of("field*"), - List.of(THIRD_FIELD)); + getContentHashVersionProcessor(List.of("field*"), List.of(THIRD_FIELD)); // Given (doc for update) SolrInputDocument inputDocument = @@ -189,12 +136,7 @@ public void shouldUseIncludedFieldsWildcard() { public void shouldUseIncludedFieldsWildcard2() { // Given (variant of previous shouldUseIncludedFieldsWildcard, without the excludedField config) ContentHashVersionProcessor processor = - getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - mock(UpdateRequestProcessor.class), - List.of("field*"), - List.of()); + getContentHashVersionProcessor(List.of("field*"), List.of()); // Given (doc for update) SolrInputDocument inputDocument = @@ -213,22 +155,11 @@ public void shouldUseIncludedFieldsWildcard2() { public void 
shouldDedupIncludedFields() { // Given (processor to include field1 and field2 only) ContentHashVersionProcessor processorWithDuplicatedFieldName = - getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - mock(UpdateRequestProcessor.class), - List.of(FIRST_FIELD, FIRST_FIELD, SECOND_FIELD), - List.of()); - + getContentHashVersionProcessor(List.of(FIRST_FIELD, FIRST_FIELD, SECOND_FIELD), List.of()); ContentHashVersionProcessor processorWithWildcard = getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - mock(UpdateRequestProcessor.class), - List.of( - SECOND_FIELD, - FIRST_FIELD, - "field1*"), // Also change order of config (test reorder of field names) + List.of( // Also change order of config (test reorder of field names) + SECOND_FIELD, FIRST_FIELD, "field1*"), List.of()); // Given (doc for update) @@ -246,198 +177,257 @@ public void shouldDedupIncludedFields() { } @Test - public void shouldCreateSignatureForNewDoc() throws IOException { - // Given - SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = - getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - Arrays.asList(FIRST_FIELD, SECOND_FIELD), - List.of()); - processor.setDiscardSameDocuments(false); - processor.setOldDocProvider((core, hashField, indexedDocId) -> null); - - // Given (command) - AddUpdateCommand cmd = new AddUpdateCommand(req); + public void shouldCreateSignatureForNewDoc() throws Exception { + // When (update) + final String newDocId = UUID.randomUUID().toString(); + assertU( + adoc( + ID_FIELD, newDocId, + FIRST_FIELD, INITIAL_FIELD1_VALUE, + SECOND_FIELD, INITIAL_FIELD2_VALUE)); + assertU(commit()); - // Given (doc for update) - SolrInputDocument inputDocument = - doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); - 
cmd.solrDoc = inputDocument; + // Then + final String hashFieldValueForNewDoc = getHashFieldValue(newDocId); + assertEquals(initialDocHash, hashFieldValueForNewDoc); + } - // When - processor.processAdd(cmd); - processor.finish(); + @Test + public void shouldAddToResponseLog() throws Exception { + // Given (command to update existing doc) + final String newDocId = UUID.randomUUID().toString(); + final SolrQueryResponse update1 = + addDocWithResponse( + adoc( + ID_FIELD, newDocId, + FIRST_FIELD, INITIAL_FIELD1_VALUE, + SECOND_FIELD, INITIAL_FIELD2_VALUE), + "contenthashversion-default"); + final SolrQueryResponse update2 = + addDocWithResponse( + adoc( + ID_FIELD, newDocId, + FIRST_FIELD, "This is a doc with values", + SECOND_FIELD, "that differs from stored doc, so it's considered new"), + "contenthashversion-default"); + assertU(commit()); // Then - assertNotNull(inputDocument.getField(HASH_FIELD_NAME)); // signature field got added - assertEquals( - processor.computeDocHash(inputDocument), - inputDocument.getField(HASH_FIELD_NAME).getValue()); // ... 
and contains expected value + assertResponse(update1, 0, 0, 0); + assertResponse(update2, 0, 0, 1); + } - // Then (asserts on hash comparison results) - verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(0)); - verify(response, times(1)) - .addToLog(eq("numAddsExistingWithIdentical"), eq(0)); // And no hash clash with old doc + @Test + public void shouldKeepDuplicateDocumentsInLogMode() throws Exception { + // Given: Use log chain which detects but does NOT drop duplicates + final String docId = UUID.randomUUID().toString(); + + // When: Add a document + addDoc( + adoc( + ID_FIELD, docId, + FIRST_FIELD, "original value", + SECOND_FIELD, "original value 2"), + "contenthashversion-log"); + assertU(commit()); + String originalHash = getHashFieldValue(docId); + + // When: Try to add the same content again (duplicate) + SolrQueryResponse duplicateResponse = + addDocWithResponse( + adoc( + ID_FIELD, docId, + FIRST_FIELD, "original value", + SECOND_FIELD, "original value 2"), + "contenthashversion-log"); + assertU(commit()); + + // Then: Response should show duplicate was detected but NOT dropped + assertResponse(duplicateResponse, 0, 1, 0); + + // Then: Document should still exist in index + assertQ(req("q", ID_FIELD + ":" + docId), "//result[@numFound='1']"); + + // Then: Document hash should remain unchanged (duplicate was processed) + String currentHash = getHashFieldValue(docId); + assertEquals("Hash should remain unchanged for duplicate", originalHash, currentHash); + + // When: Update with different content + SolrQueryResponse changedResponse = + addDocWithResponse( + adoc( + ID_FIELD, docId, + FIRST_FIELD, "changed value", + SECOND_FIELD, "changed value 2"), + "contenthashversion-log"); + assertU(commit()); + + // Then: Response should show content changed + assertResponse(changedResponse, 0, 0, 1); + + // Then: Hash should be updated + String newHash = getHashFieldValue(docId); + assertNotEquals("Hash should change for different content", originalHash, 
newHash); } @Test - public void shouldAddToResponseLog() throws IOException { - // Given - SolrQueryResponse response = mock(SolrQueryResponse.class); - ContentHashVersionProcessor processor = - getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - Arrays.asList(FIRST_FIELD, SECOND_FIELD), - List.of()); - processor.setDiscardSameDocuments(false); + public void shouldExcludeFieldsUpdateSignatureForNewDoc() throws Exception { + // Given (update using URP chain WITHOUT drop doc (log mode)) + final String newDocId = UUID.randomUUID().toString(); + addDoc( + adoc( + ID_FIELD, newDocId, + FIRST_FIELD, INITIAL_FIELD1_VALUE, + SECOND_FIELD, INITIAL_FIELD2_VALUE), + "contenthashversion-default"); + assertU(commit()); - // Given (command to update existing doc) - AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); + // Then + final String hashFieldValue = getHashFieldValue(newDocId); + assertEquals(initialDocHash, hashFieldValue); + } - // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) - SolrInputDocument initialDocument = - doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Initial values used to compute initial hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); - cmdDoesNotChangeValues.solrDoc = - doc( - f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), - f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), - f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), - f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument))); + @Test + public void shouldCommitWithDropModeEnabled() throws Exception { + // Initial document already exists from setUp() + // When: Try to add the same document again (duplicate content) using URP chain WITH drop doc + // (drop mode) + SolrQueryResponse solrQueryResponse = + addDocWithResponse( + adoc( + ID_FIELD, INITIAL_DOC_ID, + FIRST_FIELD, INITIAL_FIELD1_VALUE, + SECOND_FIELD, 
INITIAL_FIELD2_VALUE), + "contenthashversion-drop"); + assertU(commit()); + + // Then: Verify response shows duplicate was dropped + assertResponse(solrQueryResponse, 1, 1, 0); + + // Then: Verify document was NOT actually added/updated (still only 1 doc in index) + assertQ(req("q", "*:*"), "//result[@numFound='1']"); + + // Verify the document still has the original hash + String currentHash = getHashFieldValue(INITIAL_DOC_ID); + assertEquals("Document hash should not have changed", initialDocHash, currentHash); + } - // Given (command to update existing doc with different content) - AddUpdateCommand cmdChangesDocValues = new AddUpdateCommand(req); + @Test + public void shouldHandleDocumentWithOnlyIdField() { + // Given: Document with only ID field (no other fields to hash) + ContentHashVersionProcessor processor = + getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD)); - // Given (doc for update - does *not* match the existing doc, see - // getContentHashVersionProcessor()) - cmdChangesDocValues.solrDoc = - doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "This is a doc with values"), - f(SECOND_FIELD, "that differs from stored doc, so it's considered new")); - - // When - processor.processAdd(cmdDoesNotChangeValues); - processor.processAdd(cmdChangesDocValues); - processor.finish(); - - // Then (read as follows: 2 updates occurred for an existing doc. 
Among these updates, 1 update - // tried to replace - // doc with the same content) - verify(response, times(1)).addToLog(eq("numAddsExisting"), eq(2)); - verify(response, times(1)).addToLog(eq("numAddsExistingWithIdentical"), eq(1)); + // When: Compute hash for document with only ID + SolrInputDocument doc = doc(f(ID_FIELD, "only-id-doc")); + + // Then: Should compute hash (even if empty field set) + String hash = processor.computeDocHash(doc); + assertNotNull("Hash should not be null for ID-only document", hash); + assertFalse("Hash should not be empty", hash.isEmpty()); } @Test - public void shouldNotUpdateSignatureForNewDoc() throws IOException { - // Given - SolrQueryResponse response = mock(SolrQueryResponse.class); + public void shouldHandleMultiValueFields() { + // Given: Processor that includes multi-value fields ContentHashVersionProcessor processor = - getContentHashVersionProcessor( - req, response, mock(UpdateRequestProcessor.class), List.of(SECOND_FIELD), List.of()); + getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD)); - // Given (command) - AddUpdateCommand cmd = new AddUpdateCommand(req); + // When: Document with multi-value field + SolrInputDocument doc1 = doc(f(ID_FIELD, "doc1"), f(FIRST_FIELD, "value1", "value2", "value3")); - // Given (doc for update) - SolrInputDocument inputDocument = - doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); - cmd.solrDoc = inputDocument; + // Then: Should compute consistent hash + String hash1 = processor.computeDocHash(doc1); + assertNotNull(hash1); - // When - processor.processAdd(cmd); + // Same values in same order should produce same hash + SolrInputDocument doc2 = doc(f(ID_FIELD, "doc2"), f(FIRST_FIELD, "value1", "value2", "value3")); + String hash2 = processor.computeDocHash(doc2); + assertEquals("Same multi-value field should produce same hash", 
hash1, hash2); - // Then - assertNotNull(inputDocument.getField(HASH_FIELD_NAME)); // signature field got added - assertEquals( - processor.computeDocHash(inputDocument), - inputDocument.getField(HASH_FIELD_NAME).getValue()); // ... and contains expected value - verify(response, never()).addToLog(eq("numAddsExisting"), eq(0)); - verify(response, never()).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); + // Different order should produce different hash (collection order matters) + SolrInputDocument doc3 = doc(f(ID_FIELD, "doc3"), f(FIRST_FIELD, "value3", "value1", "value2")); + String hash3 = processor.computeDocHash(doc3); + assertNotEquals("Different order should produce different hash", hash1, hash3); } @Test - public void shouldExcludeFieldsUpdateSignatureForNewDoc() throws IOException { - // Given - SolrQueryResponse response = mock(SolrQueryResponse.class); + public void shouldHandleNullFieldValues() { + // Given: Processor that handles null values ContentHashVersionProcessor processor = - getContentHashVersionProcessor( - req, - response, - mock(UpdateRequestProcessor.class), - List.of(FIRST_FIELD, SECOND_FIELD), - List.of(FIRST_FIELD)); - processor.setDiscardSameDocuments(false); + getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD)); - // Given (command) - AddUpdateCommand cmd = new AddUpdateCommand(req); + // When: Document with null field value (represented as "null" string) + SolrInputDocument doc = doc(f(ID_FIELD, "null-doc"), f(FIRST_FIELD, (Object) null)); - // Given (doc for update) - SolrInputDocument inputDocument = - doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Values will serve as input to compute a hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); - cmd.solrDoc = inputDocument; + // Then: Should compute hash without error + String hash = processor.computeDocHash(doc); + assertNotNull("Should handle null values", hash); + assertFalse("Hash should not be empty", 
hash.isEmpty()); + } - // When - processor.processAdd(cmd); + @Test + public void shouldProduceSameHashRegardlessOfFieldOrder() { + // Given: Documents with same fields in different order + ContentHashVersionProcessor processor = + getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD)); - // Then - assertNotNull(inputDocument.getField(HASH_FIELD_NAME)); // signature field got added - assertEquals( - processor.computeDocHash(inputDocument), - inputDocument.getField(HASH_FIELD_NAME).getValue()); // ... and contains expected value - verify(response, never()).addToLog(eq("numAddsExisting"), eq(1)); - verify(response, never()).addToLog(eq("numAddsExistingWithIdentical"), eq(0)); + // When: Create docs with fields in different order + SolrInputDocument doc1 = + doc( + f(ID_FIELD, "doc1"), + f(FIRST_FIELD, "value1"), + f(SECOND_FIELD, "value2"), + f(THIRD_FIELD, "value3")); + + SolrInputDocument doc2 = + doc( + f(ID_FIELD, "doc2"), + f(THIRD_FIELD, "value3"), + f(FIRST_FIELD, "value1"), + f(SECOND_FIELD, "value2")); + + // Then: Hashes should be identical (fields are sorted before hashing) + String hash1 = processor.computeDocHash(doc1); + String hash2 = processor.computeDocHash(doc2); + assertEquals("Hash should be same regardless of field order", hash1, hash2); } @Test - public void shouldCommitWithDiscardModeEnabled() throws IOException { - // Given - UpdateRequestProcessor nextProcessor = mock(UpdateRequestProcessor.class); + public void shouldHandleEmptyFieldValues() { + // Given: Document with empty string values ContentHashVersionProcessor processor = - getContentHashVersionProcessor( - req, - mock(SolrQueryResponse.class), - nextProcessor, - List.of(FIRST_FIELD, SECOND_FIELD), - List.of(FIRST_FIELD)); - processor.setDiscardSameDocuments(true); + getContentHashVersionProcessor(List.of("*"), List.of(ID_FIELD)); - // Given (command to update existing doc) - AddUpdateCommand cmdDoesNotChangeValues = new AddUpdateCommand(req); + SolrInputDocument doc1 = 
doc(f(ID_FIELD, "empty-doc"), f(FIRST_FIELD, ""), f(SECOND_FIELD, "")); - // Given (doc for update - matches the existing doc, see getContentHashVersionProcessor()) - SolrInputDocument initialDocument = - doc( - f(ID_FIELD, UUID.randomUUID().toString()), - f(FIRST_FIELD, "Initial values used to compute initial hash"), - f(SECOND_FIELD, "This a constant value for testing include/exclude fields")); - cmdDoesNotChangeValues.solrDoc = - doc( - f(ID_FIELD, initialDocument.getFieldValue(ID_FIELD)), - f(FIRST_FIELD, initialDocument.getFieldValue(FIRST_FIELD)), - f(SECOND_FIELD, initialDocument.getFieldValue(SECOND_FIELD)), - f(HASH_FIELD_NAME, processor.computeDocHash(initialDocument))); + // When: Compute hash + String hash1 = processor.computeDocHash(doc1); - // When - processor.processAdd(cmdDoesNotChangeValues); + // Then: Should produce valid hash + assertNotNull("Should handle empty values", hash1); + assertFalse("Hash should not be empty", hash1.isEmpty()); - // Then - verify(nextProcessor, never()).processAdd(eq(cmdDoesNotChangeValues)); + // Empty strings should produce different hash than no fields + SolrInputDocument doc2 = doc(f(ID_FIELD, "empty-doc")); + String hash2 = processor.computeDocHash(doc2); + assertNotEquals("Empty string fields should differ from no fields", hash1, hash2); + } + + private static void assertResponse( + SolrQueryResponse solrQueryResponse, + int droppedDocCount, + int duplicateDocCount, + int changedDocCount) { + assertNotNull(solrQueryResponse.getToLog().get("contentHash.duplicatesDropped")); + assertNotNull(solrQueryResponse.getToLog().get("contentHash.duplicatesDetected")); + assertNotNull(solrQueryResponse.getToLog().get("contentHash.changed")); + + int droppedDocs = (int) solrQueryResponse.getToLog().get("contentHash.duplicatesDropped"); + int duplicateDocs = (int) solrQueryResponse.getToLog().get("contentHash.duplicatesDetected"); + int changedDocs = (int) solrQueryResponse.getToLog().get("contentHash.changed"); + 
assertEquals(droppedDocCount, droppedDocs); + assertEquals(duplicateDocCount, duplicateDocs); + assertEquals(changedDocCount, changedDocs); } } diff --git a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java index 3ff7e74aaa2..9df7e06cb6c 100644 --- a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java +++ b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java @@ -1161,6 +1161,11 @@ public static String adoc(SolrInputDocument sdoc) { } public static void addDoc(String doc, String updateRequestProcessorChain) throws Exception { + addDocWithResponse(doc, updateRequestProcessorChain); + } + + public static SolrQueryResponse addDocWithResponse(String doc, String updateRequestProcessorChain) + throws Exception { Map params = new HashMap<>(); MultiMapSolrParams mmparams = new MultiMapSolrParams(params); params.put(UpdateParams.UPDATE_CHAIN, new String[] {updateRequestProcessorChain}); @@ -1169,8 +1174,11 @@ public static void addDoc(String doc, String updateRequestProcessorChain) throws UpdateRequestHandler handler = new UpdateRequestHandler(); handler.init(null); req.setContentStreams(List.of(new ContentStreamBase.StringStream(doc))); - handler.handleRequestBody(req, new SolrQueryResponse()); + final SolrQueryResponse rsp = new SolrQueryResponse(); + handler.handleRequestBody(req, rsp); req.close(); + + return rsp; } /** From 566f109ce72b3afef6d70a6198124056c38671b3 Mon Sep 17 00:00:00 2001 From: fhuaulme Date: Fri, 10 Apr 2026 18:25:24 +0200 Subject: [PATCH 5/6] (review) =URP= * Remove while(true), move to for-i loop --- .../solr/update/processor/ContentHashVersionProcessor.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java index c17dada5eb1..ca168b107a7 100644 --- 
a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -91,13 +91,12 @@ public void processAdd(AddUpdateCommand cmd) throws IOException { SolrInputDocument newDoc = cmd.getSolrInputDocument(); String newHash = computeDocHash(newDoc); newDoc.setField(hashField.getName(), newHash); - int i = 0; if (!isHashAcceptable(cmd.getIndexedId(), newHash)) { return; } - while (true) { + for (int i = 0; ; i++) { logOverlyFailedRetries(i, cmd); try { super.processAdd(cmd); From d3a4235c18c6ae620c61181715af20edfed79b3c Mon Sep 17 00:00:00 2001 From: David Smiley Date: Fri, 10 Apr 2026 22:53:05 -0400 Subject: [PATCH 6/6] Use BinaryField; remove base64 / StringField usage --- .../ContentHashVersionProcessor.java | 34 ++++++++------ .../solr/collection1/conf/schema16.xml | 3 +- .../ContentHashVersionProcessorTest.java | 47 ++++++++++--------- 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java index ca168b107a7..f0d3a0b5443 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/ContentHashVersionProcessor.java @@ -19,9 +19,9 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.util.Base64; +import java.nio.ByteBuffer; +import java.util.Arrays; import java.util.Collection; -import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.function.Predicate; @@ -33,8 +33,8 @@ import org.apache.solr.handler.component.RealTimeGetComponent.Resolution; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.BinaryField; import 
org.apache.solr.schema.SchemaField; -import org.apache.solr.schema.TextField; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.UpdateCommand; import org.slf4j.Logger; @@ -80,7 +80,7 @@ public ContentHashVersionProcessor( UpdateRequestProcessor next) { super(next); this.core = req.getCore(); - this.hashField = new SchemaField(hashFieldName, new TextField()); + this.hashField = new SchemaField(hashFieldName, new BinaryField()); this.rsp = rsp; this.includedFields = hashIncludedFields; this.excludedFields = hashExcludedFields; @@ -89,7 +89,7 @@ public ContentHashVersionProcessor( @Override public void processAdd(AddUpdateCommand cmd) throws IOException { SolrInputDocument newDoc = cmd.getSolrInputDocument(); - String newHash = computeDocHash(newDoc); + byte[] newHash = computeDocHash(newDoc); newDoc.setField(hashField.getName(), newHash); if (!isHashAcceptable(cmd.getIndexedId(), newHash)) { @@ -144,13 +144,12 @@ void setDropSameDocuments(boolean dropSameDocuments) { this.dropSameDocuments = dropSameDocuments; } - private boolean isHashAcceptable(BytesRef indexedDocId, String newHash) throws IOException { + private boolean isHashAcceptable(BytesRef indexedDocId, byte[] newHash) throws IOException { assert null != indexedDocId; - Optional oldDocHash = getOldDocHash(indexedDocId); + Optional oldDocHash = getOldDocHash(indexedDocId); if (oldDocHash.isPresent()) { - String oldHash = oldDocHash.get(); // No hash: might want to keep track of these too - if (Objects.equals(newHash, oldHash)) { + if (Arrays.equals(newHash, oldDocHash.get())) { sameCount++; return !dropSameDocuments; } else { @@ -162,7 +161,7 @@ private boolean isHashAcceptable(BytesRef indexedDocId, String newHash) throws I } /** Retrieves the hash value from the old document identified by the given ID. 
*/ - private Optional getOldDocHash(BytesRef indexedDocId) throws IOException { + private Optional getOldDocHash(BytesRef indexedDocId) throws IOException { SolrInputDocument oldDoc = RealTimeGetComponent.getInputDocument( core, indexedDocId, indexedDocId, null, Set.of(hashField.getName()), Resolution.DOC); @@ -170,10 +169,17 @@ private Optional getOldDocHash(BytesRef indexedDocId) throws IOException return Optional.empty(); } Object o = oldDoc.getFieldValue(hashField.getName()); - return Optional.ofNullable(o).map(String::valueOf); + if (o instanceof byte[] bytes) { + return Optional.of(bytes); + } else if (o instanceof ByteBuffer buf) { + byte[] bytes = new byte[buf.remaining()]; + buf.duplicate().get(bytes); + return Optional.of(bytes); + } + return Optional.empty(); } - String computeDocHash(SolrInputDocument doc) { + byte[] computeDocHash(SolrInputDocument doc) { final Signature sig = new Lookup3Signature(); // Stream field names, filter, sort, and process in a single pass @@ -194,8 +200,6 @@ String computeDocHash(SolrInputDocument doc) { } }); - // Signature, depending on implementation, may return 8-byte or 16-byte value - byte[] signature = sig.getSignature(); - return Base64.getEncoder().encodeToString(signature); + return sig.getSignature(); } } diff --git a/solr/core/src/test-files/solr/collection1/conf/schema16.xml b/solr/core/src/test-files/solr/collection1/conf/schema16.xml index 3cfdcc119fe..f34bfed2e4c 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema16.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema16.xml @@ -20,10 +20,11 @@ + - + diff --git a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java index 6b9c908cbfe..38aa6b0a0b5 100644 --- a/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java +++ 
b/solr/core/src/test/org/apache/solr/update/processor/ContentHashVersionProcessorTest.java @@ -18,6 +18,8 @@ import static org.mockito.Mockito.mock; +import java.util.Arrays; +import java.util.Base64; import java.util.List; import java.util.UUID; import org.apache.solr.common.SolrInputDocument; @@ -109,8 +111,9 @@ public void shouldUseExcludedFieldsWildcard() { f(FOURTH_FIELD, UUID.randomUUID().toString())); // Then (only ID and THIRD_FIELD is used in hash, other fields contain random values) - assertEquals( - "bwE8Zjq0aOs=", processor.computeDocHash(inputDocument)); // Hash if only ID field was used + assertArrayEquals( + Base64.getDecoder().decode("bwE8Zjq0aOs="), + processor.computeDocHash(inputDocument)); // Hash if only ID field was used } @Test @@ -129,7 +132,7 @@ public void shouldUseIncludedFieldsWildcard() { f(FOURTH_FIELD, "constant to have a constant hash for field4")); // Then - assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); + assertArrayEquals(Base64.getDecoder().decode("PozPs2qZQtw="), processor.computeDocHash(inputDocument)); } @Test @@ -148,7 +151,7 @@ public void shouldUseIncludedFieldsWildcard2() { f(FOURTH_FIELD, "constant to have a constant hash for field4")); // Then - assertEquals("PozPs2qZQtw=", processor.computeDocHash(inputDocument)); + assertArrayEquals(Base64.getDecoder().decode("PozPs2qZQtw="), processor.computeDocHash(inputDocument)); } @Test @@ -172,8 +175,8 @@ public void shouldDedupIncludedFields() { f(FOURTH_FIELD, "constant to have a constant hash for field4")); // Then - assertEquals("XavrOYGlkXM=", processorWithDuplicatedFieldName.computeDocHash(inputDocument)); - assertEquals("XavrOYGlkXM=", processorWithWildcard.computeDocHash(inputDocument)); + assertArrayEquals(Base64.getDecoder().decode("XavrOYGlkXM="), processorWithDuplicatedFieldName.computeDocHash(inputDocument)); + assertArrayEquals(Base64.getDecoder().decode("XavrOYGlkXM="), processorWithWildcard.computeDocHash(inputDocument)); } @Test @@ -322,9 
+325,9 @@ public void shouldHandleDocumentWithOnlyIdField() { SolrInputDocument doc = doc(f(ID_FIELD, "only-id-doc")); // Then: Should compute hash (even if empty field set) - String hash = processor.computeDocHash(doc); + byte[] hash = processor.computeDocHash(doc); assertNotNull("Hash should not be null for ID-only document", hash); - assertFalse("Hash should not be empty", hash.isEmpty()); + assertTrue("Hash should not be empty", hash.length > 0); } @Test @@ -337,18 +340,18 @@ public void shouldHandleMultiValueFields() { SolrInputDocument doc1 = doc(f(ID_FIELD, "doc1"), f(FIRST_FIELD, "value1", "value2", "value3")); // Then: Should compute consistent hash - String hash1 = processor.computeDocHash(doc1); + byte[] hash1 = processor.computeDocHash(doc1); assertNotNull(hash1); // Same values in same order should produce same hash SolrInputDocument doc2 = doc(f(ID_FIELD, "doc2"), f(FIRST_FIELD, "value1", "value2", "value3")); - String hash2 = processor.computeDocHash(doc2); - assertEquals("Same multi-value field should produce same hash", hash1, hash2); + byte[] hash2 = processor.computeDocHash(doc2); + assertArrayEquals("Same multi-value field should produce same hash", hash1, hash2); // Different order should produce different hash (collection order matters) SolrInputDocument doc3 = doc(f(ID_FIELD, "doc3"), f(FIRST_FIELD, "value3", "value1", "value2")); - String hash3 = processor.computeDocHash(doc3); - assertNotEquals("Different order should produce different hash", hash1, hash3); + byte[] hash3 = processor.computeDocHash(doc3); + assertFalse("Different order should produce different hash", Arrays.equals(hash1, hash3)); } @Test @@ -361,9 +364,9 @@ public void shouldHandleNullFieldValues() { SolrInputDocument doc = doc(f(ID_FIELD, "null-doc"), f(FIRST_FIELD, (Object) null)); // Then: Should compute hash without error - String hash = processor.computeDocHash(doc); + byte[] hash = processor.computeDocHash(doc); assertNotNull("Should handle null values", hash); - 
assertFalse("Hash should not be empty", hash.isEmpty()); + assertTrue("Hash should not be empty", hash.length > 0); } @Test @@ -388,9 +391,9 @@ public void shouldProduceSameHashRegardlessOfFieldOrder() { f(SECOND_FIELD, "value2")); // Then: Hashes should be identical (fields are sorted before hashing) - String hash1 = processor.computeDocHash(doc1); - String hash2 = processor.computeDocHash(doc2); - assertEquals("Hash should be same regardless of field order", hash1, hash2); + byte[] hash1 = processor.computeDocHash(doc1); + byte[] hash2 = processor.computeDocHash(doc2); + assertArrayEquals("Hash should be same regardless of field order", hash1, hash2); } @Test @@ -402,16 +405,16 @@ public void shouldHandleEmptyFieldValues() { SolrInputDocument doc1 = doc(f(ID_FIELD, "empty-doc"), f(FIRST_FIELD, ""), f(SECOND_FIELD, "")); // When: Compute hash - String hash1 = processor.computeDocHash(doc1); + byte[] hash1 = processor.computeDocHash(doc1); // Then: Should produce valid hash assertNotNull("Should handle empty values", hash1); - assertFalse("Hash should not be empty", hash1.isEmpty()); + assertTrue("Hash should not be empty", hash1.length > 0); // Empty strings should produce different hash than no fields SolrInputDocument doc2 = doc(f(ID_FIELD, "empty-doc")); - String hash2 = processor.computeDocHash(doc2); - assertNotEquals("Empty string fields should differ from no fields", hash1, hash2); + byte[] hash2 = processor.computeDocHash(doc2); + assertFalse("Empty string fields should differ from no fields", Arrays.equals(hash1, hash2)); } private static void assertResponse(