Skip to content

Commit 7fae1ee

Browse files
committed
Fix critical concurrency races, memory management, and portability issues
Round 2 code review fixes addressing findings from 5 focused review agents. CRITICAL fixes: - Bgwriter stale decommit race: BufPoolDrainCondemnedBuffers now validates that status is still BUF_RESIZE_DRAINING and drain_from/drain_to match cached values before decommitting. Prevents MADV_REMOVE from destroying pages that a concurrent grow has reclaimed for active buffers. - GrowBufferPool active descriptor race: Skip reinitialization of descriptors that still have BM_TAG_VALID set from a cancelled shrink, as backends may hold active pins. Such buffers are naturally reused by clock sweep. - EXEC_BACKEND guard: Emit FATAL on platforms using EXEC_BACKEND (Windows) when max_shared_buffers is configured, since fork()-based MAP_SHARED inheritance is required. HIGH fixes: - Switch decommit to MADV_REMOVE for buffer blocks (always page-aligned). MADV_DONTNEED on MAP_SHARED|MAP_ANONYMOUS only zaps PTEs without releasing shmem-backed pages. MADV_REMOVE punches holes in the shmem backing store. Falls back to MADV_DONTNEED if MADV_REMOVE is unavailable. - Make BufferPoolCommitMemory incremental: only commit [old, new) range instead of [0, new). Avoids re-touching live pages and ensures rollback on partial failure only affects the new range. MEDIUM fixes: - Remove dead BUF_RESIZE_COMPLETING enum value (never set anywhere) - Fix remaining overcount in drain: only count buffers that fail eviction, not all valid buffers before attempting eviction https://claude.ai/code/session_01BvQguZ27Xd1dpgKCfsuFos
1 parent 854160e commit 7fae1ee

2 files changed

Lines changed: 142 additions & 66 deletions

File tree

src/backend/storage/buffer/buf_resize.c

Lines changed: 138 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,19 @@ BufferPoolReserveMemory(void)
9595
if (MaxNBuffers <= 0 || MaxNBuffers <= NBuffers)
9696
return;
9797

98+
#ifdef EXEC_BACKEND
99+
/*
100+
* On EXEC_BACKEND (Windows), child processes are started via CreateProcess
101+
* rather than fork(), so they do not inherit mmap'd regions. Online
102+
* buffer pool resize requires fork() semantics for shared anonymous
103+
* mappings. Refuse to start rather than silently breaking.
104+
*/
105+
ereport(FATAL,
106+
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
107+
errmsg("max_shared_buffers is not supported on this platform"),
108+
errhint("Remove the max_shared_buffers setting from postgresql.conf.")));
109+
#endif
110+
98111
/*
99112
* Calculate sizes for the maximum possible buffer count.
100113
*/
@@ -165,7 +178,7 @@ BufferPoolReserveMemory(void)
165178
}
166179

167180
/*
168-
* Commit physical memory for the given number of buffers.
181+
* Commit physical memory for buffers in the range [start_buf, end_buf).
169182
*
170183
* When growing, this makes new pages accessible. The memory was already
171184
* reserved by BufferPoolReserveMemory() using MAP_NORESERVE. On Linux,
@@ -175,15 +188,23 @@ BufferPoolReserveMemory(void)
175188
* population with early OOM detection. If unsupported, we fall back to
176189
* manually touching each page to fault it in.
177190
*
191+
* Only the delta range [start_buf, end_buf) is committed, not the entire
192+
* pool. This avoids re-touching already-committed pages and ensures
193+
* rollback on failure only affects the new range (not live buffers).
194+
*
178195
* Returns true on success, false if memory could not be committed (OOM).
179196
*/
180197
bool
181-
BufferPoolCommitMemory(int nbufs)
198+
BufferPoolCommitMemory(int start_buf, int end_buf)
182199
{
183-
Size blocks_size = mul_size((Size) nbufs, BLCKSZ);
184-
Size descs_size = mul_size((Size) nbufs, sizeof(BufferDescPadded));
185-
Size iocv_size = mul_size((Size) nbufs, sizeof(ConditionVariableMinimallyPadded));
186-
Size ckpt_size = mul_size((Size) nbufs, sizeof(CkptSortItem));
200+
Size blocks_off = mul_size((Size) start_buf, BLCKSZ);
201+
Size blocks_len = mul_size((Size) (end_buf - start_buf), BLCKSZ);
202+
Size descs_off = mul_size((Size) start_buf, sizeof(BufferDescPadded));
203+
Size descs_len = mul_size((Size) (end_buf - start_buf), sizeof(BufferDescPadded));
204+
Size iocv_off = mul_size((Size) start_buf, sizeof(ConditionVariableMinimallyPadded));
205+
Size iocv_len = mul_size((Size) (end_buf - start_buf), sizeof(ConditionVariableMinimallyPadded));
206+
Size ckpt_off = mul_size((Size) start_buf, sizeof(CkptSortItem));
207+
Size ckpt_len = mul_size((Size) (end_buf - start_buf), sizeof(CkptSortItem));
187208
bool use_madvise = false;
188209

189210
#ifdef MADV_POPULATE_WRITE
@@ -193,38 +214,37 @@ BufferPoolCommitMemory(int nbufs)
193214
* kernels), fall back to manual page touching.
194215
*
195216
* If population succeeds for some arrays but fails for others, we
196-
* roll back by releasing any already-committed pages with MADV_DONTNEED
197-
* to avoid leaving the pool in an inconsistent state.
217+
* roll back by releasing only the newly-committed pages.
198218
*/
199-
if (madvise(BufferBlocks, blocks_size, MADV_POPULATE_WRITE) == 0)
219+
if (madvise(BufferBlocks + blocks_off, blocks_len, MADV_POPULATE_WRITE) == 0)
200220
{
201221
use_madvise = true;
202222

203-
if (madvise(BufferDescriptors, descs_size, MADV_POPULATE_WRITE) != 0)
223+
if (madvise((char *) BufferDescriptors + descs_off, descs_len,
224+
MADV_POPULATE_WRITE) != 0)
204225
{
205-
/* Roll back blocks */
206-
madvise(BufferBlocks, blocks_size, MADV_DONTNEED);
226+
madvise(BufferBlocks + blocks_off, blocks_len, MADV_DONTNEED);
207227
ereport(WARNING,
208228
(errcode(ERRCODE_OUT_OF_MEMORY),
209229
errmsg("could not commit memory for buffer descriptors: %m")));
210230
return false;
211231
}
212-
if (madvise(BufferIOCVArray, iocv_size, MADV_POPULATE_WRITE) != 0)
232+
if (madvise((char *) BufferIOCVArray + iocv_off, iocv_len,
233+
MADV_POPULATE_WRITE) != 0)
213234
{
214-
/* Roll back blocks + descriptors */
215-
madvise(BufferBlocks, blocks_size, MADV_DONTNEED);
216-
madvise(BufferDescriptors, descs_size, MADV_DONTNEED);
235+
madvise(BufferBlocks + blocks_off, blocks_len, MADV_DONTNEED);
236+
madvise((char *) BufferDescriptors + descs_off, descs_len, MADV_DONTNEED);
217237
ereport(WARNING,
218238
(errcode(ERRCODE_OUT_OF_MEMORY),
219239
errmsg("could not commit memory for buffer IO CVs: %m")));
220240
return false;
221241
}
222-
if (madvise(CkptBufferIds, ckpt_size, MADV_POPULATE_WRITE) != 0)
242+
if (madvise((char *) CkptBufferIds + ckpt_off, ckpt_len,
243+
MADV_POPULATE_WRITE) != 0)
223244
{
224-
/* Roll back blocks + descriptors + IO CVs */
225-
madvise(BufferBlocks, blocks_size, MADV_DONTNEED);
226-
madvise(BufferDescriptors, descs_size, MADV_DONTNEED);
227-
madvise(BufferIOCVArray, iocv_size, MADV_DONTNEED);
245+
madvise(BufferBlocks + blocks_off, blocks_len, MADV_DONTNEED);
246+
madvise((char *) BufferDescriptors + descs_off, descs_len, MADV_DONTNEED);
247+
madvise((char *) BufferIOCVArray + iocv_off, iocv_len, MADV_DONTNEED);
228248
ereport(WARNING,
229249
(errcode(ERRCODE_OUT_OF_MEMORY),
230250
errmsg("could not commit memory for checkpoint buffer IDs: %m")));
@@ -233,10 +253,10 @@ BufferPoolCommitMemory(int nbufs)
233253
}
234254
else if (errno != EINVAL)
235255
{
236-
/* Real error (e.g., ENOMEM), not just unsupported */
237256
ereport(WARNING,
238257
(errcode(ERRCODE_OUT_OF_MEMORY),
239-
errmsg("could not commit memory for %d buffers: %m", nbufs)));
258+
errmsg("could not commit memory for buffers %d..%d: %m",
259+
start_buf, end_buf)));
240260
return false;
241261
}
242262
/* else: EINVAL means MADV_POPULATE_WRITE not supported, fall through */
@@ -251,28 +271,28 @@ BufferPoolCommitMemory(int nbufs)
251271
* Touch one byte per OS page to fault in the physical memory.
252272
* The volatile pointer prevents the compiler from optimizing this away.
253273
*/
254-
for (p = (volatile char *) BufferBlocks;
255-
p < (volatile char *) BufferBlocks + blocks_size;
274+
for (p = (volatile char *) BufferBlocks + blocks_off;
275+
p < (volatile char *) BufferBlocks + blocks_off + blocks_len;
256276
p += page_size)
257277
*p = *p;
258278

259-
for (p = (volatile char *) BufferDescriptors;
260-
p < (volatile char *) BufferDescriptors + descs_size;
279+
for (p = (volatile char *) BufferDescriptors + descs_off;
280+
p < (volatile char *) BufferDescriptors + descs_off + descs_len;
261281
p += page_size)
262282
*p = *p;
263283

264-
for (p = (volatile char *) BufferIOCVArray;
265-
p < (volatile char *) BufferIOCVArray + iocv_size;
284+
for (p = (volatile char *) BufferIOCVArray + iocv_off;
285+
p < (volatile char *) BufferIOCVArray + iocv_off + iocv_len;
266286
p += page_size)
267287
*p = *p;
268288

269-
for (p = (volatile char *) CkptBufferIds;
270-
p < (volatile char *) CkptBufferIds + ckpt_size;
289+
for (p = (volatile char *) CkptBufferIds + ckpt_off;
290+
p < (volatile char *) CkptBufferIds + ckpt_off + ckpt_len;
271291
p += page_size)
272292
*p = *p;
273293

274-
elog(DEBUG1, "committed buffer pool memory via page touching for %d buffers",
275-
nbufs);
294+
elog(DEBUG1, "committed buffer pool memory via page touching for buffers %d..%d",
295+
start_buf, end_buf);
276296
}
277297

278298
return true;
@@ -283,6 +303,16 @@ BufferPoolCommitMemory(int nbufs)
283303
*
284304
* After shrinking, we release physical pages back to the OS but keep the
285305
* virtual address reservation intact for future growth.
306+
*
307+
* For the buffer blocks array (which is always page-aligned since
308+
* BLCKSZ >= page size), we use MADV_REMOVE to punch a hole in the
309+
* shmem backing and actually free the pages. MADV_DONTNEED alone
310+
* is insufficient on MAP_SHARED mappings because it only unmaps PTEs
311+
* without releasing the underlying shmem pages.
312+
*
313+
* For smaller arrays (descriptors, CVs, ckpt IDs), their offsets may
314+
* not be page-aligned, so we use MADV_DONTNEED as a best-effort hint.
315+
* The memory waste from these arrays is small relative to the blocks.
286316
*/
287317
void
288318
BufferPoolDecommitMemory(int old_nbufs, int new_nbufs)
@@ -296,9 +326,24 @@ BufferPoolDecommitMemory(int old_nbufs, int new_nbufs)
296326
Size ckpt_offset = mul_size((Size) new_nbufs, sizeof(CkptSortItem));
297327
Size ckpt_len = mul_size((Size) (old_nbufs - new_nbufs), sizeof(CkptSortItem));
298328

299-
/* Release physical pages back to the OS */
329+
/*
330+
* Release physical pages for buffer blocks. MADV_REMOVE punches a hole
331+
* in the shmem backing store, actually freeing the memory. If it fails
332+
* (e.g., unsupported kernel), fall back to MADV_DONTNEED.
333+
*/
300334
if (blocks_len > 0)
301-
madvise(BufferBlocks + blocks_offset, blocks_len, MADV_DONTNEED);
335+
{
336+
#ifdef MADV_REMOVE
337+
if (madvise(BufferBlocks + blocks_offset, blocks_len, MADV_REMOVE) != 0)
338+
#endif
339+
madvise(BufferBlocks + blocks_offset, blocks_len, MADV_DONTNEED);
340+
}
341+
342+
/*
343+
* For smaller arrays, use MADV_DONTNEED as a best-effort hint.
344+
* These offsets may not be page-aligned, in which case madvise
345+
* silently does nothing (returns EINVAL which we ignore).
346+
*/
302347
if (descs_len > 0)
303348
madvise((char *) BufferDescriptors + descs_offset, descs_len, MADV_DONTNEED);
304349
if (iocv_len > 0)
@@ -377,7 +422,7 @@ GrowBufferPool(int new_nbuffers)
377422
*/
378423
if (ReservedBufferBlocks != NULL)
379424
{
380-
if (!BufferPoolCommitMemory(new_nbuffers))
425+
if (!BufferPoolCommitMemory(old_nbuffers, new_nbuffers))
381426
{
382427
elog(WARNING, "buffer pool grow failed: could not commit memory");
383428
return false;
@@ -390,10 +435,23 @@ GrowBufferPool(int new_nbuffers)
390435
* New buffers are appended at the end, so existing buffers are not
391436
* disturbed. This is safe because no backend can access buffer IDs
392437
* >= old_nbuffers yet (NBuffers hasn't been updated).
438+
*
439+
* However, if a previous shrink was cancelled before its drain completed,
440+
* some descriptors in this range may still have BM_TAG_VALID set and
441+
* could have active pins from backends. We must NOT reinitialize those
442+
* -- doing so would zero the refcount and corrupt the buffer state.
443+
* Such buffers will be naturally reused by the clock sweep once NBuffers
444+
* is updated to include them again.
393445
*/
394446
for (i = old_nbuffers; i < new_nbuffers; i++)
395447
{
396448
BufferDesc *buf = GetBufferDescriptor(i);
449+
uint64 buf_state;
450+
451+
/* Skip buffers still in use from a cancelled shrink */
452+
buf_state = pg_atomic_read_u64(&buf->state);
453+
if (buf_state & BM_TAG_VALID)
454+
continue;
397455

398456
ClearBufferTag(&buf->tag);
399457
pg_atomic_init_u64(&buf->state, 0);
@@ -533,49 +591,68 @@ BufPoolDrainCondemnedBuffers(void)
533591
if (!(buf_state & BM_TAG_VALID))
534592
continue;
535593

536-
remaining++;
537-
538594
/* Can't touch pinned buffers */
539595
if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
540596
{
597+
remaining++;
541598
pinned++;
542599
continue;
543600
}
544601

545602
/* Evict the buffer (handles dirty flush + invalidation) */
546603
{
547604
bool flushed = false;
605+
bool evicted;
548606

549607
if (buf_state & BM_DIRTY)
550608
dirty++;
551-
(void) EvictUnpinnedBuffer(BufferDescriptorGetBuffer(buf),
552-
&flushed);
609+
evicted = EvictUnpinnedBuffer(BufferDescriptorGetBuffer(buf),
610+
&flushed);
611+
if (!evicted)
612+
remaining++;
553613
}
554614
}
555615

556-
/* Update progress */
616+
/* Update progress under lock */
557617
SpinLockAcquire(&BufResizeCtl->mutex);
558618
BufResizeCtl->condemned_remaining = remaining;
559619
BufResizeCtl->condemned_pinned = pinned;
560620
BufResizeCtl->condemned_dirty = dirty;
561621

562622
if (remaining == 0)
563623
{
564-
/* All condemned buffers drained */
565-
BufResizeCtl->status = BUF_RESIZE_IDLE;
566-
BufResizeCtl->drain_from = 0;
567-
BufResizeCtl->drain_to = 0;
568-
BufResizeCtl->started_at = 0;
569-
BufResizeCtl->condemned_remaining = 0;
570-
BufResizeCtl->condemned_pinned = 0;
571-
BufResizeCtl->condemned_dirty = 0;
572-
SpinLockRelease(&BufResizeCtl->mutex);
573-
574-
elog(LOG, "bgwriter: condemned buffer drain complete");
575-
576-
/* Now safe to decommit memory */
577-
if (ReservedBufferBlocks != NULL)
578-
BufferPoolDecommitMemory(drain_to, drain_from);
624+
/*
625+
* All condemned buffers drained. Before decommitting, verify the
626+
* drain hasn't been superseded by a new resize request. A grow
627+
* that overlaps the condemned range could have been initiated by
628+
* the postmaster while we were iterating -- in that case, the
629+
* status and/or drain range will have changed under us.
630+
*/
631+
if (BufResizeCtl->status == BUF_RESIZE_DRAINING &&
632+
BufResizeCtl->drain_from == drain_from &&
633+
BufResizeCtl->drain_to == drain_to)
634+
{
635+
BufResizeCtl->status = BUF_RESIZE_IDLE;
636+
BufResizeCtl->drain_from = 0;
637+
BufResizeCtl->drain_to = 0;
638+
BufResizeCtl->started_at = 0;
639+
BufResizeCtl->condemned_remaining = 0;
640+
BufResizeCtl->condemned_pinned = 0;
641+
BufResizeCtl->condemned_dirty = 0;
642+
SpinLockRelease(&BufResizeCtl->mutex);
643+
644+
elog(LOG, "bgwriter: condemned buffer drain complete");
645+
646+
/* Now safe to decommit memory */
647+
if (ReservedBufferBlocks != NULL)
648+
BufferPoolDecommitMemory(drain_to, drain_from);
649+
}
650+
else
651+
{
652+
/* Drain was superseded; skip decommit */
653+
SpinLockRelease(&BufResizeCtl->mutex);
654+
elog(LOG, "bgwriter: drain superseded by new resize, skipping decommit");
655+
}
579656
}
580657
else
581658
{
@@ -605,13 +682,13 @@ RequestBufferPoolResize(int new_nbuffers)
605682
/*
606683
* If a bgwriter drain is in progress (BUF_RESIZE_DRAINING from a
607684
* previous shrink), cancel it -- the new request supersedes. The
608-
* orphaned condemned buffers are harmless (just waste some memory).
685+
* bgwriter validates the drain range before decommitting, so it's
686+
* safe to change the range while it's iterating.
609687
*
610-
* Don't interrupt a grow (BUF_RESIZE_GROWING/COMPLETING) since the
611-
* postmaster is actively executing it.
688+
* Don't interrupt a grow (BUF_RESIZE_GROWING) since the postmaster
689+
* is actively executing it.
612690
*/
613-
if (BufResizeCtl->status == BUF_RESIZE_GROWING ||
614-
BufResizeCtl->status == BUF_RESIZE_COMPLETING)
691+
if (BufResizeCtl->status == BUF_RESIZE_GROWING)
615692
{
616693
SpinLockRelease(&BufResizeCtl->mutex);
617694
ereport(WARNING,

src/include/storage/buf_resize.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@ typedef enum BufPoolResizeStatus
2727
{
2828
BUF_RESIZE_IDLE = 0, /* No resize in progress */
2929
BUF_RESIZE_GROWING, /* Adding new buffers */
30-
BUF_RESIZE_DRAINING, /* Draining condemned buffers for shrink */
31-
BUF_RESIZE_COMPLETING /* Completing resize, children updating */
30+
BUF_RESIZE_DRAINING /* Draining condemned buffers for shrink */
3231
} BufPoolResizeStatus;
3332

3433
/*
@@ -83,10 +82,10 @@ extern void BufPoolResizeShmemInit(void);
8382
extern void BufferPoolReserveMemory(void);
8483

8584
/*
86-
* Commit physical memory for the given number of buffers within
87-
* the previously reserved address space.
85+
* Commit physical memory for buffers in the range [start_buf, end_buf)
86+
* within the previously reserved address space.
8887
*/
89-
extern bool BufferPoolCommitMemory(int nbufs);
88+
extern bool BufferPoolCommitMemory(int start_buf, int end_buf);
9089

9190
/*
9291
* Decommit physical memory for buffers beyond the given count.

0 commit comments

Comments
 (0)