From 32e79963a84042438caef9038768558a4b9396bc Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Fri, 27 Mar 2026 14:54:15 -0700 Subject: [PATCH] Validate column index for null_pages/null_counts contradiction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A null_pages entry of true indicates the page consists entirely of null values; a corresponding null_counts entry of zero contradicts this, as it indicates the page has no null values at all. This inconsistency is a sign of invalid metadata that would cause incorrect page skipping during column index filtering — the reader treats such pages as all-null and silently excludes them from query results for predicates requiring non-null values. Add validation in ColumnIndexBuilder.build(PrimitiveType) to detect this contradiction and return null, consistent with the existing pattern for invalid min/max values. The null propagates through the read path, causing the filter to fall back to reading all pages for the affected column. Also fix a pre-existing NPE in the static build() method where build(type) could return null but was not null-checked before accessing boundaryOrder. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../columnindex/ColumnIndexBuilder.java | 16 ++ .../columnindex/TestColumnIndexBuilder.java | 148 ++++++++++++++++++ 2 files changed, 164 insertions(+) diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java index e78b2ceae1..80b5401cbd 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java @@ -616,6 +616,9 @@ public static ColumnIndex build( builder.fill(nullPages, nullCounts, minValues, maxValues, repLevelHistogram, defLevelHistogram); ColumnIndexBase columnIndex = builder.build(type); + if (columnIndex == null) { + return null; + } columnIndex.boundaryOrder = requireNonNull(boundaryOrder); return columnIndex; } @@ -750,6 +753,19 @@ private ColumnIndexBase build(PrimitiveType type) { if (!nullCounts.isEmpty()) { columnIndex.nullCounts = nullCounts.toLongArray(); } + // A null_pages entry of true indicates the page consists entirely of null values. + // When null_counts is present, a null_counts entry of zero means the page has no + // null values at all. These two fields directly contradict each other: a page + // cannot both consist entirely of nulls and contain zero nulls. Since null_pages + // is the field that causes the reader to skip pages during column index filtering, + // the safe response is to discard the column index for this column entirely. + if (columnIndex.nullCounts != null) { + for (int i = 0; i < columnIndex.nullPages.length; i++) { + if (columnIndex.nullPages[i] && columnIndex.nullCounts[i] == 0L) { + return null; + } + } + } columnIndex.pageIndexes = pageIndexes.toIntArray(); // Repetition and definition level histograms are optional so keep them null if the builder has no values if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) { diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java index 6274f36263..00d44c3800 100644 --- a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java +++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java @@ -1865,4 +1865,152 @@ long getMinMaxSize() { private static void assertCorrectFiltering(ColumnIndex ci, FilterPredicate predicate, int... expectedIndexes) { TestIndexIterator.assertEquals(predicate.accept(ci), expectedIndexes); } + + @Test + public void testBuildReturnsNullForNullPageCountContradiction() { + // null_pages[i]=true indicates the page is entirely null, but null_counts[i]=0 + // says there are zero null values. This contradiction indicates invalid metadata. + // build() should return null to prevent incorrect page skipping. + PrimitiveType type = Types.required(INT32).named("test_col"); + + // Pages 1-3 have null_pages=true with null_counts=0 — contradictory + assertNull( + "Column index with null_pages=true and null_counts=0 should be rejected", + ColumnIndexBuilder.build( + type, + BoundaryOrder.ASCENDING, + List.of(false, true, true, true), + List.of(0L, 0L, 0L, 0L), + toBBList(Integer.valueOf(-99), null, null, null), + toBBList(Integer.valueOf(5), null, null, null))); + + // Contradiction on a single page (last page) should also be rejected + assertNull( + "Single contradictory page should cause rejection", + ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(false, false, true), + List.of(0L, 5L, 0L), + toBBList(Integer.valueOf(1), Integer.valueOf(50), null), + toBBList(Integer.valueOf(49), Integer.valueOf(99), null))); + + // Contradiction on the first page + assertNull( + "Contradictory first page should cause rejection", + ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(true, false, false), + List.of(0L, 0L, 5L), + toBBList(null, Integer.valueOf(1), Integer.valueOf(50)), + toBBList(null, Integer.valueOf(49), Integer.valueOf(99)))); + + // Single page with contradiction + assertNull( + "Single-page column index with contradiction should be rejected", + ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(true), + List.of(0L), + toBBList((Integer) null), + toBBList((Integer) null))); + + // All pages are contradictory null pages + assertNull( + "All-contradictory column index should be rejected", + ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(true, true, true), + List.of(0L, 0L, 0L), + toBBList((Integer) null, null, null), + toBBList((Integer) null, null, null))); + } + + @Test + public void testBuildPreservesValidColumnIndex() { + PrimitiveType type = Types.required(INT32).named("test_col"); + + // Legitimate null page: null_pages=true with null_counts > 0 — valid + ColumnIndex ci = ColumnIndexBuilder.build( + type, + BoundaryOrder.ASCENDING, + List.of(false, true, false), + List.of(0L, 100L, 0L), + toBBList(Integer.valueOf(1), null, Integer.valueOf(50)), + toBBList(Integer.valueOf(49), null, Integer.valueOf(99))); + assertCorrectNullPages(ci, false, true, false); + assertCorrectNullCounts(ci, 0, 100, 0); + assertCorrectValues(ci.getMinValues(), 1, null, 50); + assertCorrectValues(ci.getMaxValues(), 49, null, 99); + + // All non-null pages — valid + ColumnIndex ci2 = ColumnIndexBuilder.build( + type, + BoundaryOrder.ASCENDING, + List.of(false, false, false), + List.of(0L, 5L, 10L), + toBBList(Integer.valueOf(1), Integer.valueOf(50), Integer.valueOf(100)), + toBBList(Integer.valueOf(49), Integer.valueOf(99), Integer.valueOf(150))); + assertCorrectNullPages(ci2, false, false, false); + assertCorrectNullCounts(ci2, 0, 5, 10); + + // Single non-null page + ColumnIndex ci3 = ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(false), + List.of(0L), + toBBList(Integer.valueOf(42)), + toBBList(Integer.valueOf(42))); + assertCorrectNullPages(ci3, false); + assertCorrectNullCounts(ci3, 0); + + // Single legitimate all-null page (null_pages=true, null_counts > 0) + ColumnIndex ci4 = ColumnIndexBuilder.build( + type, BoundaryOrder.UNORDERED, List.of(true), List.of(50L), toBBList((Integer) null), toBBList((Integer) + null)); + assertCorrectNullPages(ci4, true); + assertCorrectNullCounts(ci4, 50); + + // All pages legitimately null + ColumnIndex ci5 = ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(true, true, true), + List.of(10L, 20L, 30L), + toBBList((Integer) null, null, null), + toBBList((Integer) null, null, null)); + assertCorrectNullPages(ci5, true, true, true); + assertCorrectNullCounts(ci5, 10, 20, 30); + + // Boundary: null_counts=1 on a null page (minimum valid value) — should NOT be rejected + ColumnIndex ci6 = ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(false, true), + List.of(0L, 1L), + toBBList(Integer.valueOf(1), null), + toBBList(Integer.valueOf(99), null)); + assertCorrectNullPages(ci6, false, true); + assertCorrectNullCounts(ci6, 0, 1); + } + + @Test + public void testBuildWithoutNullCountsIsNotRejected() { + PrimitiveType type = Types.required(INT32).named("test_col"); + + // null_counts absent (optional field) — cannot detect contradiction, should build normally + ColumnIndex ci = ColumnIndexBuilder.build( + type, + BoundaryOrder.UNORDERED, + List.of(false, true, true), + null, + toBBList(Integer.valueOf(1), null, null), + toBBList(Integer.valueOf(99), null, null)); + assertCorrectNullPages(ci, false, true, true); + assertNull("null_counts should be null when not provided", ci.getNullCounts()); + } }