AirLibrary/Indexing/Store/
QueryIndex.rs

1//! # QueryIndex
2//!
3//! ## File: Indexing/Store/QueryIndex.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides index query functionality for the File Indexer service,
8//! handling search operations across indexed files with multiple search modes.
9//!
10//! ## Primary Responsibility
11//!
12//! Query the file index to find symbols and content matching search criteria,
13//! supporting literal, regex, fuzzy, and exact search modes.
14//!
15//! ## Secondary Responsibilities
16//!
17//! - Multi-mode search (literal, regex, fuzzy, exact)
18//! - Case sensitivity and whole word matching
19//! - Path and language filtering
20//! - Result pagination and ranking
21//! - Search query sanitization
22//!
23//! ## Dependencies
24//!
25//! **External Crates:**
26//! - `regex` - Regular expression search patterns
27//! - `tokio` - Async file I/O operations
28//!
29//! **Internal Modules:**
30//! - `crate::Result` - Error handling type
31//! - `crate::AirError` - Error types
32//! - `super::super::FileIndex` - Index structure definitions
33//!
34//! ## Dependents
35//!
36//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
37//!
38//! ## VSCode Pattern Reference
39//!
40//! Inspired by VSCode's search functionality in
41//! `src/vs/workbench/services/search/common/`
42//!
43//! ## Security Considerations
44//!
45//! - Search query sanitization prevents injection
46//! - Query length limits
47//! - Control character filtering
48//!
49//! ## Performance Considerations
50//!
51//! - Content index for fast token lookup
52//! - Result pagination to limit memory usage
53//! - Efficient string matching algorithms
54//! - Fuzzy search with configurable distance
55//!
56//! ## Error Handling Strategy
57//!
58//! Query operations return detailed error messages for invalid queries
59//! or search failures, treating individual file read errors as warnings.
60//!
61//! ## Thread Safety
62//!
63//! Query operations read from shared Arc<RwLock<>> state and
64//! return safe-ownership results for the caller.
65
66use std::{collections::HashMap, path::PathBuf};
67
68use regex::Regex;
69use tokio::sync::RwLock;
70
71use crate::{AirError, Result};
72// Use the full paths to types in State::CreateState
73use crate::Indexing::State::CreateState::{FileIndex, FileMetadata, SymbolInfo, SymbolKind};
74
75/// Maximum search results per query (pagination default)
76pub const MAX_SEARCH_RESULTS_DEFAULT:u32 = 100;
77
78/// Search query with multiple modes
79#[derive(Debug, Clone)]
80pub struct SearchQuery {
81	/// Search text
82	pub query:String,
83	/// Query mode (regex, literal, fuzzy)
84	pub mode:SearchMode,
85	/// Case sensitive search
86	pub case_sensitive:bool,
87	/// Exact word match
88	pub whole_word:bool,
89	/// Regex pattern (only for regex mode)
90	pub regex:Option<Regex>,
91	/// Maximum results per page
92	pub max_results:u32,
93	/// Page number for pagination
94	pub page:u32,
95}
96
97/// Search mode
98#[derive(Debug, Clone, PartialEq)]
99pub enum SearchMode {
100	/// Literal text search
101	Literal,
102	/// Regular expression search
103	Regex,
104	/// Fuzzy search with typo tolerance
105	Fuzzy,
106	/// Exact match
107	Exact,
108}
109
110/// Search result with relevance scoring
111#[derive(Debug, Clone)]
112pub struct SearchResult {
113	/// File path
114	pub path:String,
115	/// File name
116	pub file_name:String,
117	/// Matched lines with context
118	pub matches:Vec<SearchMatch>,
119	/// Relevance score (higher = more relevant)
120	pub relevance:f64,
121	/// Matched language (if applicable)
122	pub language:Option<String>,
123}
124
125/// Search match with full context
126#[derive(Debug, Clone)]
127pub struct SearchMatch {
128	/// Line number (1-indexed)
129	pub line_number:u32,
130	/// Line content
131	pub line_content:String,
132	/// Match start position
133	pub match_start:usize,
134	/// Match end position
135	pub match_end:usize,
136	/// Lines before match for context
137	pub context_before:Vec<String>,
138	/// Lines after match for context
139	pub context_after:Vec<String>,
140}
141
142/// Paginated search results
143#[derive(Debug, Clone)]
144pub struct PaginatedSearchResults {
145	/// Current page of results
146	pub results:Vec<SearchResult>,
147	/// Total number of results (across all pages)
148	pub total_count:u32,
149	/// Current page number (0-indexed)
150	pub page:u32,
151	/// Number of pages
152	pub total_pages:u32,
153	/// Results per page
154	pub page_size:u32,
155}
156
157impl IntoIterator for PaginatedSearchResults {
158	type Item = SearchResult;
159	type IntoIter = std::vec::IntoIter<SearchResult>;
160
161	fn into_iter(self) -> Self::IntoIter { self.results.into_iter() }
162}
163
164impl<'a> IntoIterator for &'a PaginatedSearchResults {
165	type Item = &'a SearchResult;
166	type IntoIter = std::slice::Iter<'a, SearchResult>;
167
168	fn into_iter(self) -> Self::IntoIter { self.results.iter() }
169}
170
171/// Search files with multiple modes and comprehensive query handling
172///
173/// Features:
174/// - Sanitized search query
175/// - Multiple search modes (literal, regex, fuzzy, exact)
176/// - Case sensitivity option
177/// - Whole word matching
178/// - Path filtering
179/// - Result pagination
180/// - Relevance-based ranking
181/// - Language filtering
182pub async fn QueryIndexSearch(
183	index:&FileIndex,
184	query:SearchQuery,
185	path_filter:Option<String>,
186	language_filter:Option<String>,
187) -> Result<PaginatedSearchResults> {
188	log::info!("[QueryIndex] Searching for: '{}' (mode: {:?})", query.query, query.mode);
189
190	// Sanitize search query
191	let sanitized_query = SanitizeSearchQuery(&query.query)?;
192
193	// Build search parameters
194	let case_sensitive = query.case_sensitive;
195	let whole_word = query.whole_word;
196	let max_results = if query.max_results == 0 {
197		MAX_SEARCH_RESULTS_DEFAULT
198	} else {
199		query.max_results.min(1000) // Cap at 1000 results
200	};
201
202	let mut all_results = Vec::new();
203
204	// Search based on mode
205	match query.mode {
206		SearchMode::Literal => {
207			QueryIndexLiteral(
208				&sanitized_query,
209				case_sensitive,
210				whole_word,
211				path_filter.as_deref(),
212				language_filter.as_deref(),
213				index,
214				&mut all_results,
215			)
216			.await;
217		},
218		SearchMode::Regex => {
219			if let Some(regex) = &query.regex {
220				QueryIndexRegex(
221					regex,
222					path_filter.as_deref(),
223					language_filter.as_deref(),
224					index,
225					&mut all_results,
226				)
227				.await;
228			} else {
229				// Try to compile regex from query
230				if let Ok(regex) = Regex::new(&sanitized_query) {
231					QueryIndexRegex(
232						&regex,
233						path_filter.as_deref(),
234						language_filter.as_deref(),
235						index,
236						&mut all_results,
237					)
238					.await;
239				}
240			}
241		},
242		SearchMode::Fuzzy => {
243			QueryIndexFuzzy(
244				&sanitized_query,
245				case_sensitive,
246				path_filter.as_deref(),
247				language_filter.as_deref(),
248				index,
249				&mut all_results,
250			)
251			.await;
252		},
253		SearchMode::Exact => {
254			QueryIndexExact(
255				&sanitized_query,
256				case_sensitive,
257				path_filter.as_deref(),
258				language_filter.as_deref(),
259				index,
260				&mut all_results,
261			)
262			.await;
263		},
264	}
265
266	// Rank results by relevance
267	all_results.sort_by(|a, b| b.relevance.partial_cmp(&a.relevance).unwrap());
268
269	// Calculate pagination
270	let total_count = all_results.len() as u32;
271	let total_pages = if max_results == 0 { 0 } else { total_count.div_ceil(max_results) };
272	let page = query.page.min(total_pages.saturating_sub(1));
273
274	// Extract current page
275	let start = (page * max_results) as usize;
276	let end = ((page + 1) * max_results).min(total_count) as usize;
277	let page_results = all_results[start..end].to_vec();
278
279	log::info!(
280		"[QueryIndex] Search completed: {} total results, page {} of {}",
281		total_count,
282		page + 1,
283		total_pages
284	);
285
286	Ok(PaginatedSearchResults { results:page_results, total_count, page, total_pages, page_size:max_results })
287}
288
289/// Sanitize search query to prevent injection and invalid patterns
290pub fn SanitizeSearchQuery(query:&str) -> Result<String> {
291	// Remove null bytes and control characters
292	let sanitized:String = query.chars().filter(|c| *c != '\0' && !c.is_control()).collect();
293
294	// Limit query length
295	if sanitized.len() > 1000 {
296		return Err(AirError::Validation(
297			"Search query exceeds maximum length of 1000 characters".to_string(),
298		));
299	}
300
301	Ok(sanitized)
302}
303
304/// Literal search (default mode)
305async fn QueryIndexLiteral(
306	query:&str,
307	case_sensitive:bool,
308	whole_word:bool,
309	path_filter:Option<&str>,
310	language_filter:Option<&str>,
311	index:&FileIndex,
312	results:&mut Vec<SearchResult>,
313) {
314	let search_query = if case_sensitive { query.to_string() } else { query.to_lowercase() };
315
316	// Search in content index first (faster)
317	if let Some(file_paths) = index.content_index.get(&search_query.to_lowercase()) {
318		for file_path in file_paths {
319			if let Some(metadata) = index.files.get(file_path) {
320				if MatchesFilters(file_path, metadata, path_filter, language_filter) {
321					if let Ok(search_result) =
322						FindMatchesInFile(file_path, &search_query, case_sensitive, whole_word, index).await
323					{
324						if !search_result.matches.is_empty() {
325							results.push(search_result);
326						}
327					}
328				}
329			}
330		}
331	}
332
333	// Also search in file names
334	for (file_path, metadata) in &index.files {
335		if results.len() >= 1000 {
336			break;
337		}
338
339		let file_name = file_path.file_name().unwrap_or_default().to_string_lossy().to_string();
340
341		let name_to_search = if case_sensitive { file_name.clone() } else { file_name.to_lowercase() };
342
343		if name_to_search.contains(&search_query) {
344			if MatchesFilters(file_path, metadata, path_filter, language_filter) {
345				// Filename match has lower relevance than content match
346				results.push(SearchResult {
347					path:file_path.to_string_lossy().to_string(),
348					file_name,
349					matches:Vec::new(),
350					relevance:0.3,
351					language:metadata.language.clone(),
352				});
353			}
354		}
355	}
356}
357
358/// Regex search mode
359async fn QueryIndexRegex(
360	regex:&Regex,
361	path_filter:Option<&str>,
362	language_filter:Option<&str>,
363	index:&FileIndex,
364	results:&mut Vec<SearchResult>,
365) {
366	for (file_path, metadata) in &index.files {
367		if results.len() >= 1000 {
368			break;
369		}
370
371		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
372			continue;
373		}
374
375		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
376			let matches = FindRegexMatches(&content, regex);
377
378			if !matches.is_empty() {
379				let relevance = CalculateRelevance(&matches, metadata);
380
381				results.push(SearchResult {
382					path:file_path.to_string_lossy().to_string(),
383					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
384					matches,
385					relevance,
386					language:metadata.language.clone(),
387				});
388			}
389		}
390	}
391}
392
393/// Fuzzy search with typo tolerance using Levenshtein distance
394async fn QueryIndexFuzzy(
395	query:&str,
396	case_sensitive:bool,
397	path_filter:Option<&str>,
398	language_filter:Option<&str>,
399	index:&FileIndex,
400	results:&mut Vec<SearchResult>,
401) {
402	let query_lower = query.to_lowercase();
403
404	for (file_path, metadata) in &index.files {
405		if results.len() >= 1000 {
406			break;
407		}
408
409		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
410			continue;
411		}
412
413		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
414			let max_distance = 2; // TODO: Make this configurable
415			let matches = FindFuzzyMatches(&content, &query_lower, case_sensitive, max_distance);
416
417			if !matches.is_empty() {
418				let relevance = CalculateRelevance(&matches, metadata) * 0.8; // Fuzzy matches have lower relevance
419
420				results.push(SearchResult {
421					path:file_path.to_string_lossy().to_string(),
422					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
423					matches,
424					relevance,
425					language:metadata.language.clone(),
426				});
427			}
428		}
429	}
430}
431
432/// Exact match search (whole word, case-sensitive)
433async fn QueryIndexExact(
434	query:&str,
435	_case_sensitive:bool,
436	path_filter:Option<&str>,
437	language_filter:Option<&str>,
438	index:&FileIndex,
439	results:&mut Vec<SearchResult>,
440) {
441	for (file_path, metadata) in &index.files {
442		if results.len() >= 1000 {
443			break;
444		}
445
446		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
447			continue;
448		}
449
450		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
451			let matches = FindExactMatches(&content, query);
452
453			if !matches.is_empty() {
454				let relevance = CalculateRelevance(&matches, metadata) * 1.1; // Exact matches have higher relevance
455
456				results.push(SearchResult {
457					path:file_path.to_string_lossy().to_string(),
458					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
459					matches,
460					relevance,
461					language:metadata.language.clone(),
462				});
463			}
464		}
465	}
466}
467
468/// Find matches in a single file with context
469async fn FindMatchesInFile(
470	file_path:&PathBuf,
471	query:&str,
472	case_sensitive:bool,
473	whole_word:bool,
474	index:&FileIndex,
475) -> Result<SearchResult> {
476	let content = tokio::fs::read_to_string(file_path)
477		.await
478		.map_err(|e| AirError::FileSystem(format!("Failed to read file: {}", e)))?;
479
480	let metadata = index
481		.files
482		.get(file_path)
483		.ok_or_else(|| AirError::Internal("File metadata not found in index".to_string()))?;
484
485	let matches = FindMatchesWithContext(&content, query, case_sensitive, whole_word);
486	let relevance = CalculateRelevance(&matches, metadata);
487
488	Ok(SearchResult {
489		path:file_path.to_string_lossy().to_string(),
490		file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
491		matches,
492		relevance,
493		language:metadata.language.clone(),
494	})
495}
496
497/// Find matches in content with surrounding context
498fn FindMatchesWithContext(content:&str, query:&str, case_sensitive:bool, whole_word:bool) -> Vec<SearchMatch> {
499	let mut matches = Vec::new();
500	let lines:Vec<&str> = content.lines().collect();
501
502	let search_in = |line:&str| -> Option<(usize, usize)> {
503		let line_to_search = if case_sensitive { line.to_string() } else { line.to_lowercase() };
504
505		let query_to_find = if case_sensitive { query.to_string() } else { query.to_lowercase() };
506
507		let start = if whole_word {
508			FindWholeWordMatch(&line_to_search, &query_to_find)
509		} else {
510			line_to_search.find(&query_to_find)
511		};
512
513		start.map(|s| (s, s + query.len()))
514	};
515
516	for (line_idx, line) in lines.iter().enumerate() {
517		let line_number = line_idx as u32 + 1;
518
519		if let Some((match_start, match_end)) = search_in(line) {
520			// Get context lines (2 before, 2 after)
521			let context_start = line_idx.saturating_sub(2);
522			let context_end = (line_idx + 3).min(lines.len());
523
524			let context_before = lines[context_start..line_idx].iter().map(|s| s.to_string()).collect();
525
526			let context_after = lines[line_idx + 1..context_end].iter().map(|s| s.to_string()).collect();
527
528			matches.push(SearchMatch {
529				line_number,
530				line_content:line.to_string(),
531				match_start,
532				match_end,
533				context_before,
534				context_after,
535			});
536		}
537	}
538
539	matches
540}
541
542/// Find whole word match with word boundary detection
543fn FindWholeWordMatch(line:&str, word:&str) -> Option<usize> {
544	let mut start = 0;
545
546	while let Some(pos) = line[start..].find(word) {
547		let actual_pos = start + pos;
548
549		// Check word boundary before
550		let valid_before = actual_pos == 0
551			|| line
552				.chars()
553				.nth(actual_pos - 1)
554				.map_or(true, |c| !c.is_alphanumeric() && c != '_');
555
556		// Check word boundary after
557		let match_end = actual_pos + word.len();
558		let valid_after =
559			match_end == line.len() || line.chars().nth(match_end).map_or(true, |c| !c.is_alphanumeric() && c != '_');
560
561		if valid_before && valid_after {
562			return Some(actual_pos);
563		}
564
565		start = actual_pos + 1;
566	}
567
568	None
569}
570
571/// Find regex matches in content
572fn FindRegexMatches(content:&str, regex:&Regex) -> Vec<SearchMatch> {
573	let mut matches = Vec::new();
574	let lines:Vec<&str> = content.lines().collect();
575
576	for (line_idx, line) in lines.iter().enumerate() {
577		let line_number = line_idx as u32 + 1;
578
579		for mat in regex.find_iter(line) {
580			matches.push(SearchMatch {
581				line_number,
582				line_content:line.to_string(),
583				match_start:mat.start(),
584				match_end:mat.end(),
585				context_before:Vec::new(),
586				context_after:Vec::new(),
587			});
588		}
589	}
590
591	matches
592}
593
594/// Find fuzzy matches using Levenshtein distance algorithm
595fn FindFuzzyMatches(content:&str, query:&str, case_sensitive:bool, max_distance:usize) -> Vec<SearchMatch> {
596	let mut matches = Vec::new();
597	let lines:Vec<&str> = content.lines().collect();
598
599	for (line_idx, line) in lines.iter().enumerate() {
600		let line_number = line_idx as u32 + 1;
601		let line_to_search = if case_sensitive { line.to_string() } else { line.to_lowercase() };
602
603		// Calculate Levenshtein distance for fuzzy matching
604		if let Some(pos) = line_to_search.find(query) {
605			// Check if the match is within the MaxDistance threshold
606			let distance = CalculateLevenshteinDistance(&line_to_search[pos..pos.saturating_add(query.len())], query);
607
608			if distance <= max_distance {
609				matches.push(SearchMatch {
610					line_number,
611					line_content:line.to_string(),
612					match_start:pos,
613					match_end:pos + query.len(),
614					context_before:Vec::new(),
615					context_after:Vec::new(),
616				});
617			}
618		}
619	}
620
621	matches
622}
623
624/// Find exact matches (word boundary and case-sensitive)
625fn FindExactMatches(content:&str, query:&str) -> Vec<SearchMatch> { FindMatchesWithContext(content, query, true, true) }
626
627/// Calculate Levenshtein distance between two strings
628fn CalculateLevenshteinDistance(s1:&str, s2:&str) -> usize {
629	let s1_chars:Vec<char> = s1.chars().collect();
630	let s2_chars:Vec<char> = s2.chars().collect();
631	let len1 = s1_chars.len();
632	let len2 = s2_chars.len();
633
634	// Create a 2D matrix to store distances
635	let mut dp = vec![vec![0usize; len2 + 1]; len1 + 1];
636
637	// Initialize the matrix
638	for i in 0..=len1 {
639		dp[i][0] = i;
640	}
641	for j in 0..=len2 {
642		dp[0][j] = j;
643	}
644
645	// Calculate distances
646	for i in 1..=len1 {
647		for j in 1..=len2 {
648			if s1_chars[i - 1] == s2_chars[j - 1] {
649				dp[i][j] = dp[i - 1][j - 1];
650			} else {
651				dp[i][j] = 1 + [
652					dp[i - 1][j],     // deletion
653					dp[i][j - 1],     // insertion
654					dp[i - 1][j - 1], // substitution
655				]
656				.into_iter()
657				.min()
658				.unwrap();
659			}
660		}
661	}
662
663	dp[len1][len2]
664}
665
666/// Calculate relevance score for search results
667fn CalculateRelevance(matches:&[SearchMatch], metadata:&FileMetadata) -> f64 {
668	let match_count = matches.len();
669	let line_count = metadata.line_count.unwrap_or(1) as f64;
670
671	// Base relevance: ratio of matching lines to total lines
672	let mut relevance = (match_count as f64 / line_count) * 10.0;
673
674	// Bonus for more matches
675	if match_count > 0 {
676		relevance += (match_count as f64).log10() * 0.5;
677	}
678
679	// Bonus for recently modified files
680	let days_old = (chrono::Utc::now() - metadata.modified).num_days() as f64;
681	relevance += 1.0 / (days_old + 1.0).max(1.0);
682
683	relevance.min(10.0).max(0.0)
684}
685
686/// Check if file matches filters
687pub fn MatchesFilters(
688	file_path:&PathBuf,
689	metadata:&FileMetadata,
690	path_filter:Option<&str>,
691	language_filter:Option<&str>,
692) -> bool {
693	// Check path filter
694	if let Some(search_path) = path_filter {
695		if !file_path.to_string_lossy().contains(search_path) {
696			return false;
697		}
698	}
699
700	// Check language filter
701	if let Some(lang) = language_filter {
702		if metadata.language.as_deref() != Some(lang) {
703			return false;
704		}
705	}
706
707	true
708}