Skip to content

Commit 5e67a4e

Browse files
committed
Fix trim! race where concurrent adds can violate n <= vec-length.
trim! reads n at the top of its loop, then uses that stale n to size the new smaller vec. If concurrent adds push n past the target capacity without changing vec (because the old vec has room), trim!'s CAS on vec succeeds, installing a vec too small for the current n. Fix: re-read n before creating the new vec and abort if n > new-cap. As a safety net, after CAS succeeds, verify n <= new-cap and grow back via ensure-free-space! if a concurrent add slipped through. Also switch maybe-sleep to tagged form so each stress test activates only the sleep point relevant to the invariant it tests.
1 parent e6a90b7 commit 5e67a4e

2 files changed

Lines changed: 199 additions & 90 deletions

File tree

data-lib/data/gvector.rkt

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@
3030
;; Without CAS, thread T1 could install a small vector while T2
3131
;; installs a large n, violating the invariant.
3232
;; Similarly, trim! uses CAS when installing a smaller vector.
33+
;;
34+
;; - n re-read in trim!:
35+
;; trim! re-reads n before creating the smaller vector. If n has
36+
;; grown past the target capacity (from concurrent adds that didn't
37+
;; need to resize vec), trim! aborts. After CAS succeeds, trim!
38+
;; checks again and grows back via ensure-free-space! if needed.
3339
(require (for-syntax racket/base
3440
syntax/contract
3541
syntax/for-body)
@@ -43,14 +49,19 @@
4349
racket/struct)
4450

4551
;; When the environment variable GVECTOR_SLEEP is set at compile time,
46-
;; (maybe-sleep) expands to (sleep 0), widening race windows so that
47-
;; stress tests can exercise the CAS retry paths. When unset, it
48-
;; expands to (void) with no runtime cost.
52+
;; (maybe-sleep tag) expands to (sleep 0.01), widening the race window
53+
;; at the specified point so stress tests can exercise CAS retry paths.
54+
;; GVECTOR_SLEEP=1 activates all sleep points; GVECTOR_SLEEP=tag
55+
;; activates only the matching point. When unset, expands to (void).
4956
(define-syntax (maybe-sleep stx)
5057
(syntax-case stx ()
51-
[(_) (if (getenv "GVECTOR_SLEEP")
52-
#'(sleep 0)
53-
#'(void))]))
58+
[(_ tag)
59+
(let ([env (getenv "GVECTOR_SLEEP")])
60+
(if (and env
61+
(or (equal? env "1")
62+
(equal? env (symbol->string (syntax-e #'tag)))))
63+
#'(sleep 0.01)
64+
#'(void)))]))
5465

5566
(define DEFAULT-CAPACITY 10)
5667

@@ -103,7 +114,7 @@
103114
(define (ensure-free-space! gv needed-free-space)
104115
(let loop ()
105116
(define v (gvector-vec gv))
106-
(maybe-sleep) ; widen window for concurrent growers
117+
(maybe-sleep ensure-space) ; widen window for concurrent growers
107118
(define new-v (grow-vec v (gvector-n gv) needed-free-space))
108119
(when new-v
109120
;; CAS: only install if vec hasn't been replaced by another thread.
@@ -119,7 +130,7 @@
119130
(begin (define n (gvector-n gv))
120131
(define v
121132
(let loop ([v1 (gvector-vec gv)])
122-
(maybe-sleep) ; widen window for concurrent growers
133+
(maybe-sleep ensure-space) ; widen window for concurrent growers
123134
(define v2 (grow-vec v1 n needed-free-space))
124135
(cond [(not v2) v1] ; already big enough
125136
;; CAS: install new vec, retry if another thread changed it
@@ -170,24 +181,34 @@
170181

171182
(define (trim! gv)
172183
;; Invariant (1): n has already been decremented before calling trim!,
173-
;; so installing a smaller vec maintains n <= vector-length(vec).
184+
;; so installing a smaller vec maintains n <= vector-length(vec),
185+
;; provided n hasn't grown since we read it. We re-read n before
186+
;; creating the new vec to abort if concurrent adds have grown n past
187+
;; our target capacity. After CAS, we verify the invariant and grow
188+
;; back if a concurrent add slipped through.
174189
(let loop ()
175-
(define n (gvector-n gv))
190+
(define n0 (gvector-n gv))
176191
(define v (gvector-vec gv))
177-
(maybe-sleep) ; widen window for concurrent add/trim
192+
(maybe-sleep trim) ; widen window for concurrent add/trim
178193
(define cap (vector-length v))
179194
(define new-cap
180195
(let shrink ([new-cap cap])
181-
(cond [(and (>= new-cap (* SHRINK-ON-FACTOR n))
196+
(cond [(and (>= new-cap (* SHRINK-ON-FACTOR n0))
182197
(>= (quotient new-cap SHRINK-BY-FACTOR) SHRINK-MIN))
183198
(shrink (quotient new-cap SHRINK-BY-FACTOR))]
184199
[else new-cap])))
185200
(when (< new-cap cap)
186-
(define new-v (make-vector new-cap #f))
187-
(vector-copy! new-v 0 v 0 n)
188-
;; CAS: only install smaller vec if no other thread replaced vec
189-
(unless (unsafe-struct*-cas! gv 0 v new-v)
190-
(loop)))))
201+
;; Re-read n: if concurrent adds grew n past new-cap, abort.
202+
(define n (gvector-n gv))
203+
(when (<= n new-cap)
204+
(define new-v (make-vector new-cap #f))
205+
(vector-copy! new-v 0 v 0 n)
206+
;; CAS: only install smaller vec if no other thread replaced vec
207+
(when (unsafe-struct*-cas! gv 0 v new-v)
208+
;; Safety net: if a concurrent add pushed n past new-cap
209+
;; between our re-read and the CAS, grow back immediately.
210+
(when (> (gvector-n gv) new-cap)
211+
(ensure-free-space! gv 0)))))))
191212

192213
;; SLOW!
193214
(define (gvector-remove! gv index)
@@ -220,7 +241,7 @@
220241
(unless (exact-nonnegative-integer? index)
221242
(raise-type-error 'gvector-ref "exact nonnegative integer" index))
222243
(let ([v (gvector-vec gv)])
223-
(maybe-sleep) ; widen window for concurrent remove/trim
244+
(maybe-sleep ref) ; widen window for concurrent remove/trim
224245
(if (< index (gvector-n gv))
225246
(unsafe-vector*-ref v index)
226247
(cond [(eq? default none)
Lines changed: 160 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,106 +1,194 @@
11
#lang racket/base
2-
;; Stress tests that recompile data/gvector with GVECTOR_SLEEP=1 to
3-
;; widen race windows, exercising the CAS retry paths. The gvector
4-
;; module is loaded in a fresh namespace with compiled file loading
5-
;; disabled, so the sleep-enabled version is never cached.
2+
;; Stress tests that recompile data/gvector with GVECTOR_SLEEP set to
3+
;; widen specific race windows. Each test activates only the sleep
4+
;; point relevant to the invariant it tests, so other operations
5+
;; remain fast. The gvector module is loaded via dynamic-require with
6+
;; compiled file loading disabled, so the sleep-enabled version is
7+
;; never cached.
68

7-
(require rackunit)
9+
(require rackunit
10+
racket/unsafe/ops)
811

9-
(define (dr sym)
10-
;; Set the env var so the compile-time switch in gvector.rkt enables sleeps,
11-
;; and disable loading compiled files so gvector.rkt is recompiled fresh.
12-
(putenv "GVECTOR_SLEEP" "1")
13-
(begin0
14-
(parameterize ([use-compiled-file-paths '()])
15-
(dynamic-require 'data/gvector sym))
16-
(putenv "GVECTOR_SLEEP" "")))
12+
(define (make-gvector-ns tag)
13+
;; Set GVECTOR_SLEEP to the tag so only the matching maybe-sleep
14+
;; expands to (sleep 0.01). Load in a fresh namespace with compiled
15+
;; files disabled. Returns a lookup function for that namespace.
16+
(putenv "GVECTOR_SLEEP" tag)
17+
(define ns (make-base-namespace))
18+
(parameterize ([use-compiled-file-paths '()]
19+
[current-namespace ns])
20+
(dynamic-require 'data/gvector #f))
21+
(putenv "GVECTOR_SLEEP" "")
22+
(lambda (sym)
23+
(parameterize ([use-compiled-file-paths '()]
24+
[current-namespace ns])
25+
(dynamic-require 'data/gvector sym))))
1726

18-
(define gvector?* (dr 'gvector?))
19-
(define make-gvector* (dr 'make-gvector))
20-
(define gvector-add!* (dr 'gvector-add!))
21-
(define gvector-ref* (dr 'gvector-ref))
22-
(define gvector-set!* (dr 'gvector-set!))
23-
(define gvector-count* (dr 'gvector-count))
24-
(define gvector-remove-last!* (dr 'gvector-remove-last!))
27+
(define ref-ns (make-gvector-ns "ref"))
28+
(define trim-ns (make-gvector-ns "trim"))
29+
(define es-ns (make-gvector-ns "ensure-space"))
2530

26-
;; Test: parallel adds don't violate n <= vector-length(vec)
27-
;; This is the exact scenario from rmculpepper's review comment:
28-
;; two concurrent add! calls both snapshot the same n and vec,
29-
;; both create new vectors of different sizes.
30-
;; Without CAS, the smaller vector could win the vec write while
31-
;; the larger n wins the n write.
32-
(test-case "sleep-stress: parallel add invariant"
33-
(for ([iter (in-range 10)])
31+
;; Test 1: vec-before-n ordering in gvector-ref.
32+
;; Only gvector-ref sleeps (tag "ref"); removes run at full speed.
33+
;; A reader reads vec, sleeps 10ms, then reads n. During the sleep,
34+
;; many concurrent removes can decrement n and trigger trim! which
35+
;; replaces vec with a shorter one. With correct vec-before-n ordering,
36+
;; the reader holds the old (large) vec and reads the new (small) n —
37+
;; safe. With wrong n-before-vec ordering, the reader reads the old
38+
;; (large) n and the new (small) vec — out of bounds.
39+
(test-case "sleep-stress: ref + remove (vec-before-n ordering)"
40+
(define make-gvector* (ref-ns 'make-gvector))
41+
(define gvector-add!* (ref-ns 'gvector-add!))
42+
(define gvector-ref* (ref-ns 'gvector-ref))
43+
(define gvector-count* (ref-ns 'gvector-count))
44+
(define gvector-remove-last!* (ref-ns 'gvector-remove-last!))
45+
(for ([iter (in-range 5)])
3446
(define gv (make-gvector*))
35-
;; Fill to capacity to force resize on next add
36-
(for ([i 10])
47+
;; Fill with enough elements that removing triggers trim!
48+
;; trim! fires when cap >= 4*n, so starting at n=1000 with
49+
;; cap=1024, after 744 removes n=256 and cap=1024 >= 4*256,
50+
;; so trim! shrinks to 512. Further removes trigger more shrinks.
51+
(for ([i 1000])
3752
(gvector-add!* gv i))
38-
(define pool (make-parallel-thread-pool 2))
39-
;; T1 adds 1 item (needs vec of ~20), T2 adds many (needs vec of ~1010)
40-
(define t1 (thread #:pool pool (lambda () (gvector-add!* gv 'from-t1))))
41-
(define t2
42-
(thread #:pool pool
43-
(lambda ()
44-
(for ([i (in-range 1000)])
45-
(gvector-add!* gv i)))))
46-
(thread-wait t1)
47-
(thread-wait t2)
48-
(parallel-thread-pool-close pool)
49-
;; The key check: n must not exceed vector length
50-
(define count (gvector-count* gv))
51-
(check-true (> count 10) (format "iter ~a: count should be > 10, got ~a" iter count))))
52-
53-
;; Test: parallel add + ref doesn't read garbage
54-
;; With sleeps between vec-read and n-read in gvector-ref,
55-
;; this exercises the vec-before-n ordering.
56-
(test-case "sleep-stress: parallel add + ref"
57-
(for ([iter (in-range 10)])
58-
(define gv (make-gvector*))
59-
(define stop? #f)
6053
(define pool (make-parallel-thread-pool 4))
61-
(define writers
62-
(for/list ([t (in-range 2)])
63-
(thread #:pool pool
64-
(lambda ()
65-
(for ([i (in-range 500)])
66-
(gvector-add!* gv i))))))
54+
(define stop? #f)
55+
(define found-garbage? #f)
56+
;; Readers that access near the end of the gvector.
57+
;; Each ref takes ~10ms due to the sleep.
58+
;; Check that returned values are valid (fixnums 0-999 or #f).
6759
(define readers
6860
(for/list ([t (in-range 2)])
6961
(thread #:pool pool
7062
(lambda ()
7163
(let loop ()
7264
(define n (gvector-count* gv))
7365
(when (> n 0)
74-
(gvector-ref* gv (sub1 n) #f))
66+
(define val (gvector-ref* gv (sub1 n) #f))
67+
(when (and val (not (and (fixnum? val) (<= 0 val 999))))
68+
(set! found-garbage? #t)))
7569
(unless stop?
7670
(loop)))))))
77-
(for-each thread-wait writers)
71+
;; Removers run at full speed (no sleep in remove/trim)
72+
(define removers
73+
(for/list ([t (in-range 2)])
74+
(thread #:pool pool
75+
(lambda ()
76+
(for ([_ (in-range 400)])
77+
(with-handlers ([exn:fail? void])
78+
(gvector-remove-last!* gv)))))))
79+
(for-each thread-wait removers)
7880
(set! stop? #t)
7981
(parallel-thread-pool-close pool)
80-
(for-each thread-wait readers)))
82+
(for-each thread-wait readers)
83+
(check-false found-garbage?
84+
(format "iter ~a: read garbage from beyond vector bounds" iter))))
8185

82-
;; Test: parallel add + remove doesn't crash
83-
;; With sleeps in both trim! and gvector-ref, this exercises
84-
;; the shrink path where vec is replaced with a shorter vector.
85-
(test-case "sleep-stress: parallel add + remove"
86-
(for ([iter (in-range 10)])
86+
;; Test 2: trim! under concurrent add pressure.
87+
;; Only trim! sleeps (tag "trim"); adds run at full speed.
88+
;; Exercises two protections in trim!:
89+
;; (a) CAS on vec: if a grower installed a larger vec during the sleep,
90+
;; CAS fails and trim! retries rather than clobbering.
91+
;; (b) n re-read: if concurrent adds grew n past the target capacity
92+
;; without changing vec (because it had room), trim! aborts.
93+
;; The invariant violation (n > vector-length(vec)) is transient and
94+
;; hard to observe because adds immediately re-grow, so the test
95+
;; primarily verifies no crashes or memory errors occur.
96+
(test-case "sleep-stress: add + remove (trim! CAS)"
97+
(define make-gvector* (trim-ns 'make-gvector))
98+
(define gvector-add!* (trim-ns 'gvector-add!))
99+
(define gvector-ref* (trim-ns 'gvector-ref))
100+
(define gvector-count* (trim-ns 'gvector-count))
101+
(define gvector-remove-last!* (trim-ns 'gvector-remove-last!))
102+
(for ([iter (in-range 5)])
87103
(define gv (make-gvector*))
88-
(for ([i 200])
104+
;; Start with enough elements that removing triggers trim!
105+
;; n=1000, cap=1024. After ~744 removes, cap >= 4*n triggers shrink.
106+
(for ([i 1000])
89107
(gvector-add!* gv i))
90-
(define pool (make-parallel-thread-pool 4))
108+
(define pool (make-parallel-thread-pool 6))
109+
(define stop? #f)
110+
(define found-garbage? #f)
111+
;; Writers add continuously so they're active during trim!'s sleep.
112+
;; Each trim! sleeps 10ms; writers must be adding during that window
113+
;; so they push n past trim!'s target capacity.
91114
(define writers
92115
(for/list ([t (in-range 2)])
93116
(thread #:pool pool
94117
(lambda ()
95-
(for ([i (in-range 200)])
96-
(gvector-add!* gv i))))))
118+
(let loop ()
119+
(with-handlers ([exn:fail? void])
120+
(gvector-add!* gv 42))
121+
(unless stop?
122+
(loop)))))))
123+
;; Checkers continuously verify n <= vector-length(vec) directly.
124+
;; gvector-ref can't detect this because vec-before-n ordering
125+
;; makes it safe even when the invariant is transiently violated.
126+
;; We read the struct fields directly via unsafe-struct*-ref
127+
;; (vec=field 0, n=field 1) to observe the transient violation.
128+
(define readers
129+
(for/list ([t (in-range 2)])
130+
(thread #:pool pool
131+
(lambda ()
132+
(let loop ()
133+
;; Read vec first, then n. If trim! just installed
134+
;; a small vec and n is still large from adds,
135+
;; we see v=small, n=large => violation.
136+
(define v (unsafe-struct*-ref gv 0))
137+
(define n (unsafe-struct*-ref gv 1))
138+
(when (> n (vector-length v))
139+
(set! found-garbage? #t))
140+
(unless stop?
141+
(loop)))))))
142+
;; Removers trigger trim! which sleeps between reading vec and CAS
97143
(define removers
98144
(for/list ([t (in-range 2)])
99145
(thread #:pool pool
100146
(lambda ()
101-
(for ([_ (in-range 100)])
147+
(for ([_ (in-range 400)])
102148
(with-handlers ([exn:fail? void])
103149
(gvector-remove-last!* gv)))))))
150+
(for-each thread-wait removers)
151+
(set! stop? #t)
104152
(for-each thread-wait writers)
153+
(for-each thread-wait readers)
105154
(parallel-thread-pool-close pool)
106-
(for-each thread-wait removers)))
155+
(check-false found-garbage?
156+
(format "iter ~a: read garbage from beyond vector bounds" iter))))
157+
158+
;; Test 3: CAS in define/ensure-space!.
159+
;; Only ensure-space! sleeps (tag "ensure-space"); other ops fast.
160+
;; Two growers both read the same vec, sleep 10ms, then both try to
161+
;; create and install new vectors. With CAS, only one succeeds and
162+
;; the other retries. Without CAS, both install and the last writer
163+
;; wins — which may be the thread creating the smaller vec while the
164+
;; other thread increments n past that vec's length.
165+
;; Note: see comment at test about reliable detection.
166+
(test-case "sleep-stress: concurrent add (ensure-space! CAS)"
167+
(define make-gvector* (es-ns 'make-gvector))
168+
(define gvector-add!* (es-ns 'gvector-add!))
169+
(define gvector-ref* (es-ns 'gvector-ref))
170+
(define gvector-count* (es-ns 'gvector-count))
171+
(for ([iter (in-range 5)])
172+
(define gv (make-gvector*))
173+
;; Fill to capacity to force resize on next add
174+
(for ([i 10])
175+
(gvector-add!* gv i))
176+
(define pool (make-parallel-thread-pool 2))
177+
;; Use symbols so we can distinguish from garbage memory
178+
(define sentinel (gensym 'val))
179+
(define t1 (thread #:pool pool (lambda () (gvector-add!* gv sentinel))))
180+
(define t2
181+
(thread #:pool pool
182+
(lambda () (apply gvector-add!* gv (build-list 100 (lambda (_) sentinel))))))
183+
(thread-wait t1)
184+
(thread-wait t2)
185+
(parallel-thread-pool-close pool)
186+
;; Verify invariant by reading every element.
187+
;; With a missing CAS, this may read garbage beyond the vector.
188+
;; However, detection is not guaranteed since the thread creating
189+
;; the larger vector tends to win the vec write.
190+
(define count (gvector-count* gv))
191+
(for ([i (in-range count)])
192+
(define val (gvector-ref* gv i))
193+
(check-true (or (fixnum? val) (eq? val sentinel))
194+
(format "iter ~a: garbage at index ~a: ~v" iter i val)))))

0 commit comments

Comments
 (0)