Skip to content

Commit 2e574ea

Browse files
committed
regex: more aggressively limit regex searches
They are heavy, and if they are broad, they take a really long time. Note this limitation in the MCP instructions too.
1 parent c50f648 commit 2e574ea

3 files changed

Lines changed: 155 additions & 65 deletions

File tree

src/db/postgres.rs

Lines changed: 142 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,57 @@ fn resolve_case(plan: &TextSearchPlan) -> CaseSensitivity {
103103
}
104104
}
105105

106+
fn plan_has_regex(plan: &TextSearchPlan) -> bool {
107+
plan.required_terms
108+
.iter()
109+
.chain(plan.excluded_terms.iter())
110+
.any(|term| matches!(term, ContentPredicate::Regex(_)))
111+
}
112+
113+
fn request_has_regex(request: &TextSearchRequest) -> bool {
114+
request.plans.iter().any(plan_has_regex)
115+
}
116+
117+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
118+
struct SearchBudgets {
119+
fetch_limit: i64,
120+
file_limit: i64,
121+
plan_row_limit: i64,
122+
}
123+
124+
fn compute_search_budgets(request: &TextSearchRequest) -> SearchBudgets {
125+
let page_index = u64::from(request.page);
126+
let page_size = u64::from(request.page_size.max(1));
127+
let (sample_factor, fetch_limit_cap, plan_row_limit) = if request_has_regex(request) {
128+
(
129+
u64::from(REGEX_FILE_SAMPLE_FACTOR.max(1)),
130+
REGEX_FETCH_LIMIT_CAP,
131+
REGEX_PLAN_ROW_LIMIT,
132+
)
133+
} else {
134+
(
135+
u64::from(FILE_SAMPLE_FACTOR.max(1)),
136+
DEFAULT_FETCH_LIMIT_CAP,
137+
DEFAULT_PLAN_ROW_LIMIT,
138+
)
139+
};
140+
let base_limit = page_index
141+
.saturating_add(1)
142+
.saturating_mul(page_size)
143+
.saturating_mul(sample_factor);
144+
let minimum = page_size.saturating_mul(sample_factor);
145+
let fetch_limit_u64 = base_limit.max(minimum).saturating_add(1);
146+
let mut fetch_limit = fetch_limit_u64.min(i64::MAX as u64) as i64;
147+
fetch_limit = fetch_limit.min(fetch_limit_cap);
148+
let file_limit = fetch_limit.min(FILE_LIMIT_CAP);
149+
150+
SearchBudgets {
151+
fetch_limit,
152+
file_limit,
153+
plan_row_limit,
154+
}
155+
}
156+
106157
fn push_search_ctes<'a>(
107158
qb: &mut QueryBuilder<'a, Postgres>,
108159
request: &'a TextSearchRequest,
@@ -151,7 +202,7 @@ fn push_search_ctes<'a>(
151202

152203
let case_mode = resolve_case(plan);
153204
let highlight_case_sensitive = matches!(case_mode, CaseSensitivity::Yes);
154-
let prefer_repo_first = !plan.repos.is_empty();
205+
let seed_repo_first = !plan_has_regex(plan) && !plan.repos.is_empty();
155206

156207
qb.push("(");
157208
qb.push(
@@ -174,7 +225,7 @@ fn push_search_ctes<'a>(
174225
",
175226
);
176227
qb.push_bind(plan.include_historical);
177-
if prefer_repo_first {
228+
if seed_repo_first {
178229
qb.push(
179230
" AS include_historical
180231
FROM (
@@ -288,27 +339,27 @@ fn push_search_ctes<'a>(
288339

289340
qb.push(" WHERE TRUE");
290341

291-
if !prefer_repo_first && !plan.repos.is_empty() {
342+
if !seed_repo_first && !plan.repos.is_empty() {
292343
qb.push(" AND files.repository = ANY(");
293344
qb.push_bind(&plan.repos);
294345
qb.push(")");
295346
}
296347

297-
if !prefer_repo_first && !plan.excluded_repos.is_empty() {
348+
if !seed_repo_first && !plan.excluded_repos.is_empty() {
298349
qb.push(" AND NOT (files.repository = ANY(");
299350
qb.push_bind(&plan.excluded_repos);
300351
qb.push("))");
301352
}
302353

303-
if !prefer_repo_first && !plan.file_globs.is_empty() {
354+
if !seed_repo_first && !plan.file_globs.is_empty() {
304355
for pattern in &plan.file_globs {
305356
qb.push(" AND files.file_path ILIKE ");
306357
qb.push_bind(pattern);
307358
qb.push(" ESCAPE '\\'");
308359
}
309360
}
310361

311-
if !prefer_repo_first && !plan.excluded_file_globs.is_empty() {
362+
if !seed_repo_first && !plan.excluded_file_globs.is_empty() {
312363
for pattern in &plan.excluded_file_globs {
313364
qb.push(" AND files.file_path NOT ILIKE ");
314365
qb.push_bind(pattern);
@@ -1692,19 +1743,11 @@ ORDER BY idx
16921743
));
16931744
}
16941745

1695-
let page_index = u64::from(request.page);
1696-
let page_size = u64::from(request.page_size.max(1));
1697-
let sample_factor = u64::from(FILE_SAMPLE_FACTOR.max(1));
1698-
let base_limit = page_index
1699-
.saturating_add(1)
1700-
.saturating_mul(page_size)
1701-
.saturating_mul(sample_factor);
1702-
let minimum = page_size.saturating_mul(sample_factor);
1703-
let fetch_limit_u64 = base_limit.max(minimum).saturating_add(1);
1704-
let mut fetch_limit = fetch_limit_u64.min(i64::MAX as u64) as i64;
1705-
fetch_limit = fetch_limit.min(5000);
1706-
let file_limit = fetch_limit.min(25000);
1707-
let plan_row_limit: i64 = 5000;
1746+
let SearchBudgets {
1747+
fetch_limit,
1748+
file_limit,
1749+
plan_row_limit,
1750+
} = compute_search_budgets(request);
17081751

17091752
let needs_live_branch_filter = request
17101753
.plans
@@ -1967,8 +2010,8 @@ ORDER BY idx
19672010
let chunk_start_line: i32 = best_row.start_line.try_into().unwrap_or(i32::MAX);
19682011
let best_match_line =
19692012
chunk_start_line.saturating_add(best_row.match_line_number - 1);
1970-
let best_start_line = chunk_start_line
1971-
.saturating_add(best_row.snippet_start_line_number - 1);
2013+
let best_start_line =
2014+
chunk_start_line.saturating_add(best_row.snippet_start_line_number - 1);
19722015
let best_end_line = snippet_end_line(&best_row.content_text, best_start_line);
19732016
let best_match_spans = normalize_literal_match_spans(
19742017
&best_row.content_text,
@@ -2745,6 +2788,12 @@ impl PostgresDb {
27452788
}
27462789

27472790
const FILE_SAMPLE_FACTOR: u32 = 6;
2791+
const REGEX_FILE_SAMPLE_FACTOR: u32 = 2;
2792+
const DEFAULT_FETCH_LIMIT_CAP: i64 = 5000;
2793+
const REGEX_FETCH_LIMIT_CAP: i64 = 1000;
2794+
const FILE_LIMIT_CAP: i64 = 25000;
2795+
const DEFAULT_PLAN_ROW_LIMIT: i64 = 5000;
2796+
const REGEX_PLAN_ROW_LIMIT: i64 = 1000;
27482797
const INSERT_BATCH_SIZE: usize = 1000;
27492798

27502799
#[derive(sqlx::FromRow)]
@@ -3034,7 +3083,9 @@ fn merge_overlapping_snippets(mut snippets: Vec<SearchSnippet>) -> Vec<SearchSni
30343083
merged
30353084
}
30363085

3037-
fn build_snippet_line_map(snippet: &SearchSnippet) -> BTreeMap<i32, (String, Vec<SearchMatchSpan>)> {
3086+
fn build_snippet_line_map(
3087+
snippet: &SearchSnippet,
3088+
) -> BTreeMap<i32, (String, Vec<SearchMatchSpan>)> {
30383089
let mut map = BTreeMap::new();
30393090
for (idx, (line, spans)) in split_snippet_lines(snippet).into_iter().enumerate() {
30403091
let line_number = snippet.start_line.saturating_add(idx as i32);
@@ -3132,19 +3183,11 @@ mod tests {
31323183
use super::*;
31333184

31343185
fn build_phase1_sql(request: &TextSearchRequest) -> String {
3135-
let page_index = u64::from(request.page);
3136-
let page_size = u64::from(request.page_size.max(1));
3137-
let sample_factor = u64::from(FILE_SAMPLE_FACTOR.max(1));
3138-
let base_limit = page_index
3139-
.saturating_add(1)
3140-
.saturating_mul(page_size)
3141-
.saturating_mul(sample_factor);
3142-
let minimum = page_size.saturating_mul(sample_factor);
3143-
let fetch_limit_u64 = base_limit.max(minimum).saturating_add(1);
3144-
let mut fetch_limit = fetch_limit_u64.min(i64::MAX as u64) as i64;
3145-
fetch_limit = fetch_limit.min(5000);
3146-
let file_limit = fetch_limit.min(25000);
3147-
let plan_row_limit: i64 = 5000;
3186+
let SearchBudgets {
3187+
fetch_limit,
3188+
file_limit,
3189+
plan_row_limit,
3190+
} = compute_search_budgets(request);
31483191

31493192
let needs_live_branch_filter = request
31503193
.plans
@@ -3203,11 +3246,13 @@ mod tests {
32033246
assert_eq!(lines.len(), 5);
32043247
assert_eq!(merged_snippet.match_spans.len(), 2);
32053248
assert_eq!(
3206-
&merged_snippet.content_text[merged_snippet.match_spans[0].start..merged_snippet.match_spans[0].end],
3249+
&merged_snippet.content_text
3250+
[merged_snippet.match_spans[0].start..merged_snippet.match_spans[0].end],
32073251
"hit_a"
32083252
);
32093253
assert_eq!(
3210-
&merged_snippet.content_text[merged_snippet.match_spans[1].start..merged_snippet.match_spans[1].end],
3254+
&merged_snippet.content_text
3255+
[merged_snippet.match_spans[1].start..merged_snippet.match_spans[1].end],
32113256
"hit_b"
32123257
);
32133258
}
@@ -3237,7 +3282,10 @@ mod tests {
32373282
let lines: Vec<&str> = merged_snippet.content_text.split('\n').collect();
32383283
assert_eq!(lines.len(), 5);
32393284
assert_eq!(lines[2], "hit_b");
3240-
assert_eq!(merged_snippet.match_spans, vec![SearchMatchSpan { start: 14, end: 19 }]);
3285+
assert_eq!(
3286+
merged_snippet.match_spans,
3287+
vec![SearchMatchSpan { start: 14, end: 19 }]
3288+
);
32413289
}
32423290

32433291
#[test]
@@ -3261,11 +3309,13 @@ mod tests {
32613309
let merged_snippet = &merged[0];
32623310

32633311
assert_eq!(
3264-
&merged_snippet.content_text[merged_snippet.match_spans[0].start..merged_snippet.match_spans[0].end],
3312+
&merged_snippet.content_text
3313+
[merged_snippet.match_spans[0].start..merged_snippet.match_spans[0].end],
32653314
"failed for block"
32663315
);
32673316
assert_eq!(
3268-
&merged_snippet.content_text[merged_snippet.match_spans[1].start..merged_snippet.match_spans[1].end],
3317+
&merged_snippet.content_text
3318+
[merged_snippet.match_spans[1].start..merged_snippet.match_spans[1].end],
32693319
"write"
32703320
);
32713321
}
@@ -3290,8 +3340,7 @@ mod tests {
32903340
let text = r#"pg_fatal("seek failed for block %u", blockno);"#;
32913341
let original = vec![SearchMatchSpan { start: 17, end: 33 }];
32923342

3293-
let normalized =
3294-
normalize_literal_match_spans(text, &original, "failed for block", true);
3343+
let normalized = normalize_literal_match_spans(text, &original, "failed for block", true);
32953344

32963345
let expected_start = text.find("failed for block").expect("phrase should exist");
32973346
assert_eq!(
@@ -3306,8 +3355,7 @@ mod tests {
33063355
#[test]
33073356
fn normalize_literal_match_spans_preserves_regex_patterns() {
33083357
let original = vec![SearchMatchSpan { start: 5, end: 11 }];
3309-
let normalized =
3310-
normalize_literal_match_spans("abcde failed", &original, "fail.*", true);
3358+
let normalized = normalize_literal_match_spans("abcde failed", &original, "fail.*", true);
33113359
assert_eq!(normalized, original);
33123360
}
33133361

@@ -3325,6 +3373,56 @@ mod tests {
33253373
let sql = build_phase1_sql(&request);
33263374
assert!(!sql.contains("INTERSECT"));
33273375
}
3376+
3377+
#[test]
3378+
fn plain_repo_filtered_search_seeds_from_files() {
3379+
let request = TextSearchRequest::from_query_str("repo:pointer polly").unwrap();
3380+
let sql = build_phase1_sql(&request);
3381+
3382+
assert!(sql.contains("FROM\n files f_seed"));
3383+
assert!(sql.contains("f_seed.repository = ANY("));
3384+
}
3385+
3386+
#[test]
3387+
fn regex_repo_filtered_search_seeds_from_chunks() {
3388+
let request =
3389+
TextSearchRequest::from_query_str("repo:pointer regex:\"unsafe\\\\s*\\\\{\"").unwrap();
3390+
let sql = build_phase1_sql(&request);
3391+
3392+
assert!(sql.contains("FROM\n chunks c"));
3393+
assert!(!sql.contains("f_seed.repository = ANY("));
3394+
assert!(sql.contains("files.repository = ANY("));
3395+
}
3396+
3397+
#[test]
3398+
fn regex_search_uses_smaller_phase1_budgets() {
3399+
let request = TextSearchRequest::from_query_str("regex:\"foo.*bar\"").unwrap();
3400+
let budgets = compute_search_budgets(&request);
3401+
3402+
assert_eq!(
3403+
budgets,
3404+
SearchBudgets {
3405+
fetch_limit: 51,
3406+
file_limit: 51,
3407+
plan_row_limit: REGEX_PLAN_ROW_LIMIT,
3408+
}
3409+
);
3410+
}
3411+
3412+
#[test]
3413+
fn plain_search_keeps_default_phase1_budgets() {
3414+
let request = TextSearchRequest::from_query_str("polly").unwrap();
3415+
let budgets = compute_search_budgets(&request);
3416+
3417+
assert_eq!(
3418+
budgets,
3419+
SearchBudgets {
3420+
fetch_limit: 151,
3421+
file_limit: 151,
3422+
plan_row_limit: DEFAULT_PLAN_ROW_LIMIT,
3423+
}
3424+
);
3425+
}
33283426
}
33293427

33303428
fn build_search_stats(rows: &[RankedFileRow]) -> SearchResultsStats {

src/mcp/server.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,10 @@ async fn mcp_rpc(Json(req): Json<JsonRpcRequest>) -> Response {
210210
211211
Usage guidance:
212212
- Prefer narrow, incremental reads with `file_content` using `start_line` and `end_line`.
213+
- Prefer `all_terms` and `any_terms` over `regex` whenever literal terms can answer the question.
213214
- Use `all_terms` for AND semantics, `any_terms` for OR semantics, and `regex` only for regex content matching.
215+
- Avoid broad regexes with leading `.*`, trailing `.*`, very common terms, or loose wildcards over large scopes.
216+
- When regex is necessary, keep the pattern specific and pair it with restrictive filters such as `repo`, `branch`, `path`, `file`, or `lang`.
214217
- If branch recency or version differences matter, call `repo_branches` first and compare explicit branch results.
215218
216219
Citation requirement:

src/pages/search.rs

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
use crate::db::models::{
2-
FacetCount, SearchMatchSpan, SearchResult, SearchResultsPage, SearchResultsStats,
3-
SearchSnippet,
2+
FacetCount, SearchMatchSpan, SearchResult, SearchResultsPage, SearchResultsStats, SearchSnippet,
43
};
54
use crate::dsl::DEFAULT_PAGE_SIZE;
65
use crate::services::search_service::search;
@@ -976,14 +975,13 @@ mod tests {
976975
let start = input.find("failed for block").expect("phrase should exist");
977976
let end = start + "failed for block".len();
978977

979-
let segments = segment_snippet_by_spans(
980-
input,
981-
&[SearchMatchSpan { start, end }],
982-
);
978+
let segments = segment_snippet_by_spans(input, &[SearchMatchSpan { start, end }]);
983979

984-
assert!(segments.iter().any(|(text, highlighted)| {
985-
*highlighted && text == "failed for block"
986-
}));
980+
assert!(
981+
segments
982+
.iter()
983+
.any(|(text, highlighted)| { *highlighted && text == "failed for block" })
984+
);
987985
}
988986

989987
#[test]
@@ -1003,10 +1001,7 @@ mod tests {
10031001
#[test]
10041002
fn segment_snippet_by_spans_rejects_non_char_boundary_spans() {
10051003
let input = "é failed";
1006-
let segments = segment_snippet_by_spans(
1007-
input,
1008-
&[SearchMatchSpan { start: 1, end: 8 }],
1009-
);
1004+
let segments = segment_snippet_by_spans(input, &[SearchMatchSpan { start: 1, end: 8 }]);
10101005

10111006
assert_eq!(segments, vec![(input.to_string(), false)]);
10121007
}
@@ -1017,17 +1012,11 @@ mod tests {
10171012
let start = input.find("failed").expect("phrase should exist");
10181013
let end = start + "failed".len();
10191014

1020-
let segments = segment_snippet_by_spans(
1021-
input,
1022-
&[SearchMatchSpan { start, end }],
1023-
);
1015+
let segments = segment_snippet_by_spans(input, &[SearchMatchSpan { start, end }]);
10241016

10251017
assert_eq!(
10261018
segments,
1027-
vec![
1028-
("é ".to_string(), false),
1029-
("failed".to_string(), true),
1030-
]
1019+
vec![("é ".to_string(), false), ("failed".to_string(), true),]
10311020
);
10321021
}
10331022
}

0 commit comments

Comments
 (0)