@@ -95,6 +95,19 @@ BufferPoolReserveMemory(void)
9595 if (MaxNBuffers <= 0 || MaxNBuffers <= NBuffers )
9696 return ;
9797
98+ #ifdef EXEC_BACKEND
99+ /*
100+ * On EXEC_BACKEND (Windows), child processes are started via CreateProcess
101+ * rather than fork(), so they do not inherit mmap'd regions. Online
102+ * buffer pool resize requires fork() semantics for shared anonymous
103+ * mappings. Refuse to start rather than silently breaking.
104+ */
105+ ereport (FATAL ,
106+ (errcode (ERRCODE_FEATURE_NOT_SUPPORTED ),
107+ errmsg ("max_shared_buffers is not supported on this platform" ),
108+ errhint ("Remove the max_shared_buffers setting from postgresql.conf." )));
109+ #endif
110+
98111 /*
99112 * Calculate sizes for the maximum possible buffer count.
100113 */
@@ -165,7 +178,7 @@ BufferPoolReserveMemory(void)
165178}
166179
167180/*
168- * Commit physical memory for the given number of buffers .
181+ * Commit physical memory for buffers in the range [start_buf, end_buf) .
169182 *
170183 * When growing, this makes new pages accessible. The memory was already
171184 * reserved by BufferPoolReserveMemory() using MAP_NORESERVE. On Linux,
@@ -175,15 +188,23 @@ BufferPoolReserveMemory(void)
175188 * population with early OOM detection. If unsupported, we fall back to
176189 * manually touching each page to fault it in.
177190 *
191+ * Only the delta range [start_buf, end_buf) is committed, not the entire
192+ * pool. This avoids re-touching already-committed pages and ensures
193+ * rollback on failure only affects the new range (not live buffers).
194+ *
178195 * Returns true on success, false if memory could not be committed (OOM).
179196 */
180197bool
181- BufferPoolCommitMemory (int nbufs )
198+ BufferPoolCommitMemory (int start_buf , int end_buf )
182199{
183- Size blocks_size = mul_size ((Size ) nbufs , BLCKSZ );
184- Size descs_size = mul_size ((Size ) nbufs , sizeof (BufferDescPadded ));
185- Size iocv_size = mul_size ((Size ) nbufs , sizeof (ConditionVariableMinimallyPadded ));
186- Size ckpt_size = mul_size ((Size ) nbufs , sizeof (CkptSortItem ));
200+ Size blocks_off = mul_size ((Size ) start_buf , BLCKSZ );
201+ Size blocks_len = mul_size ((Size ) (end_buf - start_buf ), BLCKSZ );
202+ Size descs_off = mul_size ((Size ) start_buf , sizeof (BufferDescPadded ));
203+ Size descs_len = mul_size ((Size ) (end_buf - start_buf ), sizeof (BufferDescPadded ));
204+ Size iocv_off = mul_size ((Size ) start_buf , sizeof (ConditionVariableMinimallyPadded ));
205+ Size iocv_len = mul_size ((Size ) (end_buf - start_buf ), sizeof (ConditionVariableMinimallyPadded ));
206+ Size ckpt_off = mul_size ((Size ) start_buf , sizeof (CkptSortItem ));
207+ Size ckpt_len = mul_size ((Size ) (end_buf - start_buf ), sizeof (CkptSortItem ));
187208 bool use_madvise = false;
188209
189210#ifdef MADV_POPULATE_WRITE
@@ -193,38 +214,37 @@ BufferPoolCommitMemory(int nbufs)
193214 * kernels), fall back to manual page touching.
194215 *
195216 * If population succeeds for some arrays but fails for others, we
196- * roll back by releasing any already-committed pages with MADV_DONTNEED
197- * to avoid leaving the pool in an inconsistent state.
217+ * roll back by releasing only the newly-committed pages.
198218 */
199- if (madvise (BufferBlocks , blocks_size , MADV_POPULATE_WRITE ) == 0 )
219+ if (madvise (BufferBlocks + blocks_off , blocks_len , MADV_POPULATE_WRITE ) == 0 )
200220 {
201221 use_madvise = true;
202222
203- if (madvise (BufferDescriptors , descs_size , MADV_POPULATE_WRITE ) != 0 )
223+ if (madvise ((char * ) BufferDescriptors + descs_off , descs_len ,
224+ MADV_POPULATE_WRITE ) != 0 )
204225 {
205- /* Roll back blocks */
206- madvise (BufferBlocks , blocks_size , MADV_DONTNEED );
226+ madvise (BufferBlocks + blocks_off , blocks_len , MADV_DONTNEED );
207227 ereport (WARNING ,
208228 (errcode (ERRCODE_OUT_OF_MEMORY ),
209229 errmsg ("could not commit memory for buffer descriptors: %m" )));
210230 return false;
211231 }
212- if (madvise (BufferIOCVArray , iocv_size , MADV_POPULATE_WRITE ) != 0 )
232+ if (madvise ((char * ) BufferIOCVArray + iocv_off , iocv_len ,
233+ MADV_POPULATE_WRITE ) != 0 )
213234 {
214- /* Roll back blocks + descriptors */
215- madvise (BufferBlocks , blocks_size , MADV_DONTNEED );
216- madvise (BufferDescriptors , descs_size , MADV_DONTNEED );
235+ madvise (BufferBlocks + blocks_off , blocks_len , MADV_DONTNEED );
236+ madvise ((char * ) BufferDescriptors + descs_off , descs_len , MADV_DONTNEED );
217237 ereport (WARNING ,
218238 (errcode (ERRCODE_OUT_OF_MEMORY ),
219239 errmsg ("could not commit memory for buffer IO CVs: %m" )));
220240 return false;
221241 }
222- if (madvise (CkptBufferIds , ckpt_size , MADV_POPULATE_WRITE ) != 0 )
242+ if (madvise ((char * ) CkptBufferIds + ckpt_off , ckpt_len ,
243+ MADV_POPULATE_WRITE ) != 0 )
223244 {
224- /* Roll back blocks + descriptors + IO CVs */
225- madvise (BufferBlocks , blocks_size , MADV_DONTNEED );
226- madvise (BufferDescriptors , descs_size , MADV_DONTNEED );
227- madvise (BufferIOCVArray , iocv_size , MADV_DONTNEED );
245+ madvise (BufferBlocks + blocks_off , blocks_len , MADV_DONTNEED );
246+ madvise ((char * ) BufferDescriptors + descs_off , descs_len , MADV_DONTNEED );
247+ madvise ((char * ) BufferIOCVArray + iocv_off , iocv_len , MADV_DONTNEED );
228248 ereport (WARNING ,
229249 (errcode (ERRCODE_OUT_OF_MEMORY ),
230250 errmsg ("could not commit memory for checkpoint buffer IDs: %m" )));
@@ -233,10 +253,10 @@ BufferPoolCommitMemory(int nbufs)
233253 }
234254 else if (errno != EINVAL )
235255 {
236- /* Real error (e.g., ENOMEM), not just unsupported */
237256 ereport (WARNING ,
238257 (errcode (ERRCODE_OUT_OF_MEMORY ),
239- errmsg ("could not commit memory for %d buffers: %m" , nbufs )));
258+ errmsg ("could not commit memory for buffers %d..%d: %m" ,
259+ start_buf , end_buf )));
240260 return false;
241261 }
242262 /* else: EINVAL means MADV_POPULATE_WRITE not supported, fall through */
@@ -251,28 +271,28 @@ BufferPoolCommitMemory(int nbufs)
251271 * Touch one byte per OS page to fault in the physical memory.
252272 * The volatile pointer prevents the compiler from optimizing this away.
253273 */
254- for (p = (volatile char * ) BufferBlocks ;
255- p < (volatile char * ) BufferBlocks + blocks_size ;
274+ for (p = (volatile char * ) BufferBlocks + blocks_off ;
275+ p < (volatile char * ) BufferBlocks + blocks_off + blocks_len ;
256276 p += page_size )
257277 * p = * p ;
258278
259- for (p = (volatile char * ) BufferDescriptors ;
260- p < (volatile char * ) BufferDescriptors + descs_size ;
279+ for (p = (volatile char * ) BufferDescriptors + descs_off ;
280+ p < (volatile char * ) BufferDescriptors + descs_off + descs_len ;
261281 p += page_size )
262282 * p = * p ;
263283
264- for (p = (volatile char * ) BufferIOCVArray ;
265- p < (volatile char * ) BufferIOCVArray + iocv_size ;
284+ for (p = (volatile char * ) BufferIOCVArray + iocv_off ;
285+ p < (volatile char * ) BufferIOCVArray + iocv_off + iocv_len ;
266286 p += page_size )
267287 * p = * p ;
268288
269- for (p = (volatile char * ) CkptBufferIds ;
270- p < (volatile char * ) CkptBufferIds + ckpt_size ;
289+ for (p = (volatile char * ) CkptBufferIds + ckpt_off ;
290+ p < (volatile char * ) CkptBufferIds + ckpt_off + ckpt_len ;
271291 p += page_size )
272292 * p = * p ;
273293
274- elog (DEBUG1 , "committed buffer pool memory via page touching for %d buffers " ,
275- nbufs );
294+ elog (DEBUG1 , "committed buffer pool memory via page touching for buffers %d..%d " ,
295+ start_buf , end_buf );
276296 }
277297
278298 return true;
@@ -283,6 +303,16 @@ BufferPoolCommitMemory(int nbufs)
283303 *
284304 * After shrinking, we release physical pages back to the OS but keep the
285305 * virtual address reservation intact for future growth.
306+ *
307+ * For the buffer blocks array (which is always page-aligned since
308+ * BLCKSZ >= page size), we use MADV_REMOVE to punch a hole in the
309+ * shmem backing and actually free the pages. MADV_DONTNEED alone
310+ * is insufficient on MAP_SHARED mappings because it only unmaps PTEs
311+ * without releasing the underlying shmem pages.
312+ *
313+ * For smaller arrays (descriptors, CVs, ckpt IDs), their offsets may
314+ * not be page-aligned, so we use MADV_DONTNEED as a best-effort hint.
315+ * The memory waste from these arrays is small relative to the blocks.
286316 */
287317void
288318BufferPoolDecommitMemory (int old_nbufs , int new_nbufs )
@@ -296,9 +326,24 @@ BufferPoolDecommitMemory(int old_nbufs, int new_nbufs)
296326 Size ckpt_offset = mul_size ((Size ) new_nbufs , sizeof (CkptSortItem ));
297327 Size ckpt_len = mul_size ((Size ) (old_nbufs - new_nbufs ), sizeof (CkptSortItem ));
298328
299- /* Release physical pages back to the OS */
329+ /*
330+ * Release physical pages for buffer blocks. MADV_REMOVE punches a hole
331+ * in the shmem backing store, actually freeing the memory. If it fails
332+ * (e.g., unsupported kernel), fall back to MADV_DONTNEED.
333+ */
300334 if (blocks_len > 0 )
301- madvise (BufferBlocks + blocks_offset , blocks_len , MADV_DONTNEED );
335+ {
336+ #ifdef MADV_REMOVE
337+ if (madvise (BufferBlocks + blocks_offset , blocks_len , MADV_REMOVE ) != 0 )
338+ #endif
339+ madvise (BufferBlocks + blocks_offset , blocks_len , MADV_DONTNEED );
340+ }
341+
342+ /*
343+ * For smaller arrays, use MADV_DONTNEED as a best-effort hint.
344+ * These offsets may not be page-aligned, in which case madvise
345+ * silently does nothing (returns EINVAL which we ignore).
346+ */
302347 if (descs_len > 0 )
303348 madvise ((char * ) BufferDescriptors + descs_offset , descs_len , MADV_DONTNEED );
304349 if (iocv_len > 0 )
@@ -377,7 +422,7 @@ GrowBufferPool(int new_nbuffers)
377422 */
378423 if (ReservedBufferBlocks != NULL )
379424 {
380- if (!BufferPoolCommitMemory (new_nbuffers ))
425+ if (!BufferPoolCommitMemory (old_nbuffers , new_nbuffers ))
381426 {
382427 elog (WARNING , "buffer pool grow failed: could not commit memory" );
383428 return false;
@@ -390,10 +435,23 @@ GrowBufferPool(int new_nbuffers)
390435 * New buffers are appended at the end, so existing buffers are not
391436 * disturbed. This is safe because no backend can access buffer IDs
392437 * >= old_nbuffers yet (NBuffers hasn't been updated).
438+ *
439+ * However, if a previous shrink was cancelled before its drain completed,
440+ * some descriptors in this range may still have BM_TAG_VALID set and
441+ * could have active pins from backends. We must NOT reinitialize those
442+ * -- doing so would zero the refcount and corrupt the buffer state.
443+ * Such buffers will be naturally reused by the clock sweep once NBuffers
444+ * is updated to include them again.
393445 */
394446 for (i = old_nbuffers ; i < new_nbuffers ; i ++ )
395447 {
396448 BufferDesc * buf = GetBufferDescriptor (i );
449+ uint64 buf_state ;
450+
451+ /* Skip buffers still in use from a cancelled shrink */
452+ buf_state = pg_atomic_read_u64 (& buf -> state );
453+ if (buf_state & BM_TAG_VALID )
454+ continue ;
397455
398456 ClearBufferTag (& buf -> tag );
399457 pg_atomic_init_u64 (& buf -> state , 0 );
@@ -533,49 +591,68 @@ BufPoolDrainCondemnedBuffers(void)
533591 if (!(buf_state & BM_TAG_VALID ))
534592 continue ;
535593
536- remaining ++ ;
537-
538594 /* Can't touch pinned buffers */
539595 if (BUF_STATE_GET_REFCOUNT (buf_state ) != 0 )
540596 {
597+ remaining ++ ;
541598 pinned ++ ;
542599 continue ;
543600 }
544601
545602 /* Evict the buffer (handles dirty flush + invalidation) */
546603 {
547604 bool flushed = false;
605+ bool evicted ;
548606
549607 if (buf_state & BM_DIRTY )
550608 dirty ++ ;
551- (void ) EvictUnpinnedBuffer (BufferDescriptorGetBuffer (buf ),
552- & flushed );
609+ evicted = EvictUnpinnedBuffer (BufferDescriptorGetBuffer (buf ),
610+ & flushed );
611+ if (!evicted )
612+ remaining ++ ;
553613 }
554614 }
555615
556- /* Update progress */
616+ /* Update progress under lock */
557617 SpinLockAcquire (& BufResizeCtl -> mutex );
558618 BufResizeCtl -> condemned_remaining = remaining ;
559619 BufResizeCtl -> condemned_pinned = pinned ;
560620 BufResizeCtl -> condemned_dirty = dirty ;
561621
562622 if (remaining == 0 )
563623 {
564- /* All condemned buffers drained */
565- BufResizeCtl -> status = BUF_RESIZE_IDLE ;
566- BufResizeCtl -> drain_from = 0 ;
567- BufResizeCtl -> drain_to = 0 ;
568- BufResizeCtl -> started_at = 0 ;
569- BufResizeCtl -> condemned_remaining = 0 ;
570- BufResizeCtl -> condemned_pinned = 0 ;
571- BufResizeCtl -> condemned_dirty = 0 ;
572- SpinLockRelease (& BufResizeCtl -> mutex );
573-
574- elog (LOG , "bgwriter: condemned buffer drain complete" );
575-
576- /* Now safe to decommit memory */
577- if (ReservedBufferBlocks != NULL )
578- BufferPoolDecommitMemory (drain_to , drain_from );
624+ /*
625+ * All condemned buffers drained. Before decommitting, verify the
626+ * drain hasn't been superseded by a new resize request. A grow
627+ * that overlaps the condemned range could have been initiated by
628+ * the postmaster while we were iterating -- in that case, the
629+ * status and/or drain range will have changed under us.
630+ */
631+ if (BufResizeCtl -> status == BUF_RESIZE_DRAINING &&
632+ BufResizeCtl -> drain_from == drain_from &&
633+ BufResizeCtl -> drain_to == drain_to )
634+ {
635+ BufResizeCtl -> status = BUF_RESIZE_IDLE ;
636+ BufResizeCtl -> drain_from = 0 ;
637+ BufResizeCtl -> drain_to = 0 ;
638+ BufResizeCtl -> started_at = 0 ;
639+ BufResizeCtl -> condemned_remaining = 0 ;
640+ BufResizeCtl -> condemned_pinned = 0 ;
641+ BufResizeCtl -> condemned_dirty = 0 ;
642+ SpinLockRelease (& BufResizeCtl -> mutex );
643+
644+ elog (LOG , "bgwriter: condemned buffer drain complete" );
645+
646+ /* Now safe to decommit memory */
647+ if (ReservedBufferBlocks != NULL )
648+ BufferPoolDecommitMemory (drain_to , drain_from );
649+ }
650+ else
651+ {
652+ /* Drain was superseded; skip decommit */
653+ SpinLockRelease (& BufResizeCtl -> mutex );
654+ elog (LOG , "bgwriter: drain superseded by new resize, skipping decommit" );
655+ }
579656 }
580657 else
581658 {
@@ -605,13 +682,13 @@ RequestBufferPoolResize(int new_nbuffers)
605682 /*
606683 * If a bgwriter drain is in progress (BUF_RESIZE_DRAINING from a
607684 * previous shrink), cancel it -- the new request supersedes. The
608- * orphaned condemned buffers are harmless (just waste some memory).
685+ * bgwriter validates the drain range before decommitting, so it's
686+ * safe to change the range while it's iterating.
609687 *
610- * Don't interrupt a grow (BUF_RESIZE_GROWING/COMPLETING ) since the
611- * postmaster is actively executing it.
688+ * Don't interrupt a grow (BUF_RESIZE_GROWING) since the postmaster
689+ * is actively executing it.
612690 */
613- if (BufResizeCtl -> status == BUF_RESIZE_GROWING ||
614- BufResizeCtl -> status == BUF_RESIZE_COMPLETING )
691+ if (BufResizeCtl -> status == BUF_RESIZE_GROWING )
615692 {
616693 SpinLockRelease (& BufResizeCtl -> mutex );
617694 ereport (WARNING ,
0 commit comments