AirLibrary/Indexing/Scan/
ScanDirectory.rs

1//! # ScanDirectory
2//!
3//! ## File: Indexing/Scan/ScanDirectory.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides directory scanning functionality for the File Indexer service,
8//! handling recursive traversal of directories to discover files for indexing.
9//!
10//! ## Primary Responsibility
11//!
12//! Scan directories recursively to discover files matching include patterns
13//! while respecting exclude patterns and filesystem limits.
14//!
15//! ## Secondary Responsibilities
16//!
17//! - Validate directory permissions before scanning
18//! - Parallel file enumeration for performance
19//! - Skip directories like node_modules, target, .git
20//! - Collect files with metadata for batch processing
21//!
22//! ## Dependencies
23//!
24//! **External Crates:**
25//! - `ignore` - .gitignore-aware directory walking
26//! - `tokio` - Async runtime for I/O operations
27//!
28//! **Internal Modules:**
29//! - `crate::Result` - Error handling type
30//! - `crate::AirError` - Error types
31//! - `crate::Configuration::IndexingConfig` - Indexing configuration
32//!
33//! ## Dependents
34//!
35//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
36//! - `Indexing::Background::StartWatcher` - Background task scanning
37//!
38//! ## VSCode Pattern Reference
39//!
40//! Inspired by VSCode's file system scanning in
41//! `src/vs/base/common/files/`
42//!
43//! ## Security Considerations
44//!
45//! - Path traversal protection through canonicalization
46//! - Symbolic link following disabled by default
47//! - Depth limits prevent infinite recursion
48//! - Permission checking before access
49//!
50//! ## Performance Considerations
51//!
52//! - Parallel directory scanning with limited concurrency
53//! - Batch collection of files for processing
54//! - Lazy evaluation with ignore crate
55//! - Early filtering by file patterns
56//!
57//! ## Error Handling Strategy
58//!
//! Scan operations log warnings for individual errors and continue,
//! returning an error only if the top-level operation fails.
61//!
62//! ## Thread Safety
63//!
64//! Scan operations are designed to be called from async tasks and
65//! return collectable results for parallel processing.
66
67use std::{collections::HashSet, path::Path, sync::Arc};
68
69use tokio::sync::{RwLock, Semaphore};
70
71use crate::{
72	AirError,
73	Configuration::IndexingConfig,
74	Indexing::{
75		Scan::ScanFile::{IndexFileInternal, ValidateFileAccess},
76		State::CreateState::{FileIndex, FileMetadata, SymbolInfo, SymbolLocation},
77	},
78	Result,
79};
80
/// Aggregate statistics produced by a directory scan.
///
/// `Default` yields an all-zero value, which is the natural starting point
/// when accumulating the results of several scans.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ScanDirectoryResult {
	/// Number of files accepted for indexing
	pub files_found:u32,
	/// Number of files skipped (symlink, size limit or pattern mismatch)
	pub files_skipped:u32,
	/// Number of errors encountered while scanning
	pub errors:u32,
	/// Total size of the accepted files in bytes
	pub total_size:u64,
}
93
94/// Scan a directory recursively and collect matching files
95///
96/// Features:
97/// - Path traversal protection
98/// - Symbolic link handling (disabled by default)
99/// - File size validation
100/// - Permission error handling
101/// - Include/exclude pattern support
102/// - Parallel scanning with semaphore limits
103pub async fn ScanDirectory(
104	path:&str,
105	patterns:Vec<String>,
106	config:&IndexingConfig,
107	max_parallel:usize,
108) -> Result<(Vec<std::path::PathBuf>, ScanDirectoryResult)> {
109	let directory_path = crate::Configuration::ConfigurationManager::ExpandPath(path)?;
110
111	// Validate directory exists and is accessible
112	if !directory_path.exists() {
113		return Err(AirError::FileSystem(format!("Directory does not exist: {}", path)));
114	}
115
116	if !directory_path.is_dir() {
117		return Err(AirError::FileSystem(format!("Path is not a directory: {}", path)));
118	}
119
120	// Check directory permissions
121	CheckDirectoryPermissions(&directory_path).await?;
122
123	// Build file patterns
124	let include_patterns = if patterns.is_empty() { config.FileTypes.clone() } else { patterns };
125
126	// Walk directory with .gitignore support
127	let walker = ignore::WalkBuilder::new(&directory_path)
128		.max_depth(Some(10)) // Prevent infinite recursion
129		.hidden(false)
130		.follow_links(false) // Don't follow symlinks by default
131		.build();
132
133	let mut files_to_scan:Vec<std::path::PathBuf> = Vec::new();
134	let mut files_found = 0u32;
135	let mut files_skipped = 0u32;
136	let mut errors = 0u32;
137	let mut total_size = 0u64;
138
139	// Collect all files first
140	for result in walker {
141		match result {
142			Ok(entry) => {
143				// Only index regular files (not directories or symlinks)
144				if entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
145					let file_path = entry.path().to_path_buf();
146
147					// Check if file is a symbolic link
148					if entry.path_is_symlink() {
149						log::debug!("[ScanDirectory] Skipping symlink: {}", file_path.display());
150						files_skipped += 1;
151						continue;
152					}
153
154					// Check file size limit
155					if let Ok(metadata) = entry.metadata() {
156						let file_size = metadata.len();
157
158						if file_size > config.MaxFileSizeMb as u64 * 1024 * 1024 {
159							log::warn!(
160								"[ScanDirectory] Skipping oversized file: {} ({} bytes)",
161								file_path.display(),
162								file_size
163							);
164							files_skipped += 1;
165							continue;
166						}
167
168						// Check file pattern
169						if MatchesPatterns(&file_path, &include_patterns) {
170							// Try to get file access to validate permissions
171							if ValidateFileAccess(&file_path).await {
172								files_to_scan.push(file_path);
173								files_found += 1;
174								total_size += file_size;
175							} else {
176								log::warn!(
177									"[ScanDirectory] Cannot access file (permission denied): {}",
178									file_path.display()
179								);
180								errors += 1;
181							}
182						} else {
183							files_skipped += 1;
184						}
185					} else {
186						errors += 1;
187					}
188				}
189			},
190			Err(e) => {
191				log::warn!("[ScanDirectory] Error walking directory: {}", e);
192				errors += 1;
193			},
194		}
195	}
196
197	log::info!(
198		"[ScanDirectory] Directory scan completed: {} files, {} skipped, {} errors, {} bytes",
199		files_found,
200		files_skipped,
201		errors,
202		total_size
203	);
204
205	Ok((
206		files_to_scan,
207		ScanDirectoryResult { files_found, files_skipped, errors, total_size },
208	))
209}
210
211/// Scan a directory and remove deleted files from index
212pub async fn ScanAndRemoveDeleted(index:&mut FileIndex, directory_path:&Path) -> Result<u32> {
213	let mut paths_to_remove = Vec::new();
214	let all_paths:Vec<_> = index.files.keys().cloned().collect();
215
216	for path in all_paths {
217		if !path.exists() && path.starts_with(directory_path) {
218			paths_to_remove.push(path.clone());
219		}
220	}
221
222	let removed_count = paths_to_remove.len();
223	for path in paths_to_remove {
224		index.files.remove(&path);
225		index.file_symbols.remove(&path);
226
227		// Remove from symbol index
228		for (_, locations) in index.symbol_index.iter_mut() {
229			locations.retain(|loc| loc.file_path != path);
230		}
231
232		// Remove from content index
233		for (_, files) in index.content_index.iter_mut() {
234			files.retain(|p| p != &path);
235		}
236	}
237
238	Ok(removed_count as u32)
239}
240
241/// Check directory read permissions
242async fn CheckDirectoryPermissions(path:&Path) -> Result<()> {
243	tokio::task::spawn_blocking({
244		let path = path.to_path_buf();
245		move || {
246			std::fs::read_dir(&path)
247				.map_err(|e| AirError::FileSystem(format!("Cannot read directory {}: {}", path.display(), e)))?;
248			Ok(())
249		}
250	})
251	.await?
252}
253
254/// Check if file path matches any of the provided patterns
255pub fn MatchesPatterns(file_path:&std::path::Path, patterns:&[String]) -> bool {
256	if patterns.is_empty() {
257		return true;
258	}
259
260	let file_name = file_path.file_name().unwrap_or_default().to_string_lossy().to_string();
261
262	for pattern in patterns {
263		if MatchesPattern(&file_name, pattern) {
264			return true;
265		}
266	}
267
268	false
269}
270
/// Check whether `filename` matches a single pattern.
///
/// Two pattern forms are supported:
/// - `*.ext` (any suffix starting with a dot, e.g. `*.tar.gz`): matches when
///   the file name ends with `.ext`. The dot is part of the required suffix,
///   so `*.rs` matches `main.rs` but not `cars`.
/// - anything else: matches only when it equals the file name exactly.
///
/// Matching is case-sensitive.
pub fn MatchesPattern(filename:&str, pattern:&str) -> bool {
	match pattern.strip_prefix('*') {
		// Keep the leading dot in the suffix so the extension boundary is enforced.
		Some(suffix) if suffix.starts_with('.') => filename.ends_with(suffix),
		_ => filename == pattern,
	}
}
280
/// Built-in list of names and patterns that are excluded from scanning by
/// default: dependency and build output directories, VCS metadata, Python
/// caches and virtual environments, editor folders and OS artifacts.
pub fn GetDefaultExcludePatterns() -> Vec<String> {
	const DEFAULT_EXCLUDES:[&str; 20] = [
		// Dependency and build output
		"node_modules",
		"target",
		// Version control metadata
		".git",
		".svn",
		".hg",
		".bzr",
		// Bundler / framework output
		"dist",
		"build",
		".next",
		".nuxt",
		// Python caches and environments
		"__pycache__",
		"*.pyc",
		".venv",
		"venv",
		"env",
		".env",
		// Editor and OS artifacts
		".idea",
		".vscode",
		".DS_Store",
		"Thumbs.db",
	];

	DEFAULT_EXCLUDES.iter().map(|entry| entry.to_string()).collect()
}
306
307/// Parallel scan of multiple directories
308pub async fn ScanDirectoriesParallel(
309	directories:Vec<String>,
310	patterns:Vec<String>,
311	config:&IndexingConfig,
312	max_parallel:usize,
313) -> Result<(Vec<std::path::PathBuf>, ScanDirectoryResult)> {
314	let semaphore = Arc::new(Semaphore::new(max_parallel));
315	let mut all_files = Vec::new();
316	let mut total_result = ScanDirectoryResult { files_found:0, files_skipped:0, errors:0, total_size:0 };
317
318	let mut scan_tasks = Vec::new();
319
320	for directory in directories {
321		let permit = semaphore.clone().acquire_owned().await.unwrap();
322		let config_clone = config.clone();
323		let patterns_clone = patterns.clone();
324
325		let task = tokio::spawn(async move {
326			let _permit = permit;
327			ScanDirectory(&directory, patterns_clone, &config_clone, max_parallel).await
328		});
329
330		scan_tasks.push(task);
331	}
332
333	// Collect results
334	for task in scan_tasks {
335		match task.await {
336			Ok(Ok((files, result))) => {
337				all_files.extend(files);
338				total_result.files_found += result.files_found;
339				total_result.files_skipped += result.files_skipped;
340				total_result.errors += result.errors;
341				total_result.total_size += result.total_size;
342			},
343			Ok(Err(e)) => {
344				log::error!("[ScanDirectory] Parallel scan failed: {}", e);
345				total_result.errors += 1;
346			},
347			Err(e) => {
348				log::error!("[ScanDirectory] Parallel task panicked: {}", e);
349				total_result.errors += 1;
350			},
351		}
352	}
353
354	Ok((all_files, total_result))
355}
356
357/// Get file count statistics for a directory without full scan
358pub async fn GetDirectoryStatistics(path:&str, max_depth:Option<usize>) -> Result<DirectoryStatistics> {
359	let directory_path = crate::Configuration::ConfigurationManager::ExpandPath(path)?;
360
361	if !directory_path.exists() || !directory_path.is_dir() {
362		return Err(AirError::FileSystem(format!("Invalid directory: {}", path)));
363	}
364
365	let mut file_count = 0u64;
366	let mut total_size = 0u64;
367	let mut directory_count = 0u64;
368	let mut hidden_count = 0u64;
369
370	let walker = ignore::WalkBuilder::new(&directory_path)
371		.max_depth(max_depth)
372		.hidden(true)
373		.follow_links(false)
374		.build();
375
376	for entry in walker.flatten() {
377		let file_type = entry.file_type().expect("Failed to get file type");
378
379		if file_type.is_file() {
380			file_count += 1;
381			if let Ok(metadata) = entry.metadata() {
382				total_size += metadata.len();
383			}
384		} else if file_type.is_dir() {
385			directory_count += 1;
386		}
387
388		if entry.depth() > 0
389			&& entry
390				.path()
391				.components()
392				.any(|c| c.as_os_str().to_string_lossy().starts_with('.'))
393		{
394			hidden_count += 1;
395		}
396	}
397
398	Ok(DirectoryStatistics { file_count, directory_count, hidden_count, total_size })
399}
400
/// Summary counters describing the contents of a directory tree.
///
/// `Default` yields an all-zero value.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DirectoryStatistics {
	/// Number of regular files encountered
	pub file_count:u64,
	/// Number of directories encountered
	pub directory_count:u64,
	/// Number of entries counted as hidden (dot-prefixed path component)
	pub hidden_count:u64,
	/// Combined size in bytes of the counted files
	pub total_size:u64,
}