Skip to main content

AirLibrary/Indexing/Scan/
ScanFile.rs

1//! # ScanFile
2//!
3//! ## File: Indexing/Scan/ScanFile.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides individual file scanning functionality for the File Indexer
8//! service, handling reading, metadata extraction, and categorization of files
9//! for indexing.
10//!
11//! ## Primary Responsibility
12//!
13//! Scan individual files to extract metadata, content, and prepare them for
14//! indexing operations.
15//!
16//! ## Secondary Responsibilities
17//!
18//! - File access validation and permission checking
19//! - Encoding detection for text files
20//! - Language detection for code files
21//! - File size validation
22//! - Symbolic link detection
23//!
24//! ## Dependencies
25//!
26//! **External Crates:**
27//! - `tokio` - Async file I/O operations
28//! - `sha2` - Checksum calculation for file integrity
29//!
30//! **Internal Modules:**
31//! - `crate::Result` - Error handling type
32//! - `crate::AirError` - Error types
33//! - `crate::Configuration::IndexingConfig` - Indexing configuration
34//! - `super::super::State::CreateState` - State structure definitions
35//! - `super::Process::ProcessContent` - Content processing operations
36//!
37//! ## Dependents
38//!
39//! - `Indexing::Scan::ScanDirectory` - Batch file processing
40//! - `Indexing::Watch::WatchFile` - Individual file change handling
41//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
42//!
43//! ## VSCode Pattern Reference
44//!
45//! Inspired by VSCode's file scanning in
46//! `src/vs/workbench/services/files/`
47//!
48//! ## Security Considerations
49//!
50//! - Path canonicalization before access
51//! - File size limits enforced
52//! - Timeout protection for I/O operations
53//! - Permission checking before reads
54//!
55//! ## Performance Considerations
56//!
57//! - Asynchronous file reading
58//! - Batch processing operations
59//! - Memory-efficient streaming for large files
60//! - Cached metadata when available
61//!
62//! ## Error Handling Strategy
63//!
64//! File scanning returns Results with detailed error messages about
65//! why a file cannot be scanned or accessed. Errors are logged and
66//! individual file failures don't halt batch operations.
67//!
68//! ## Thread Safety
69//!
//! File scanning operations are designed for parallel execution and
//! produce results that can be safely merged into shared state.
71use std::{
72	path::PathBuf,
73	time::{Duration, Instant},
74};
75
// Scan results can be safely merged into shared state (see "Thread Safety" in the module docs).
77use crate::dev_log;
78use crate::{
79	AirError,
80	Configuration::IndexingConfig,
81	Indexing::{
82		Process::{
83			ExtractSymbols::ExtractSymbols,
84			ProcessContent::{DetectEncoding, DetectLanguage, DetectMimeType},
85		},
86		State::CreateState::{FileMetadata, SymbolInfo},
87	},
88	Result,
89};
90
91/// Index a single file internally with comprehensive validation
92///
93/// This function is called by parallel tasks during directory scanning
94/// and includes:
95/// - File metadata extraction
96/// - Size validation
97/// - SHA-256 checksum calculation
98/// - Encoding detection
99/// - MIME type detection
100/// - Language detection
101/// - Symbol extraction for code files
102pub async fn IndexFileInternal(
103	file_path:&PathBuf,
104
105	config:&IndexingConfig,
106
107	_patterns:&[String],
108) -> Result<(FileMetadata, Vec<SymbolInfo>)> {
109	let start_time = Instant::now();
110
111	// Get file metadata with error handling
112	let metadata = std::fs::metadata(file_path)
113		.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
114
115	// Get modified time
116	let modified = metadata
117		.modified()
118		.map_err(|e| AirError::FileSystem(format!("Failed to get modification time: {}", e)))?;
119
120	let modified_time = chrono::DateTime::<chrono::Utc>::from(modified);
121
122	// Check if file size exceeds limit
123	let file_size = metadata.len();
124
125	if file_size > config.MaxFileSizeMb as u64 * 1024 * 1024 {
126		return Err(AirError::FileSystem(format!(
127			"File size {} exceeds limit {} MB",
128			file_size, config.MaxFileSizeMb
129		)));
130	}
131
132	// File read with timeout protection
133	let content = tokio::time::timeout(Duration::from_secs(30), tokio::fs::read(file_path))
134		.await
135		.map_err(|_| AirError::FileSystem(format!("Timeout reading file: {} (30s limit)", file_path.display())))?
136		.map_err(|e| AirError::FileSystem(format!("Failed to read file: {}", e)))?;
137
138	// Check for symbolic link
139	let is_symlink = std::fs::symlink_metadata(file_path)
140		.map(|m| m.file_type().is_symlink())
141		.unwrap_or(false);
142
143	// Calculate SHA-256 checksum
144	let checksum = CalculateChecksum(&content);
145
146	// Detect file encoding
147	let encoding = DetectEncoding(&content);
148
149	// Detect MIME type
150	let mime_type = DetectMimeType(file_path, &content);
151
152	// Detect programming language
153	let language = DetectLanguage(file_path);
154
155	// Count lines for text files
156	let line_count = if mime_type.starts_with("text/") {
157		Some(content.iter().filter(|&&b| b == b'\n').count() as u32 + 1)
158	} else {
159		None
160	};
161
162	// Extract symbols from code for VSCode Outline View
163	let symbols = if let Some(lang) = &language {
164		ExtractSymbols(file_path, &content, lang).await?
165	} else {
166		Vec::new()
167	};
168
169	let permissions = GetPermissionsString(&metadata);
170
171	let elapsed = start_time.elapsed();
172
173	dev_log!(
174		"indexing",
175		"indexed {} in {}ms ({} symbols)",
176		file_path.display(),
177		elapsed.as_millis(),
178		symbols.len()
179	);
180
181	Ok((
182		FileMetadata {
183			path:file_path.clone(),
184			size:file_size,
185			modified:modified_time,
186			mime_type,
187			language,
188			line_count,
189			checksum,
190			is_symlink,
191			permissions,
192			encoding,
193			indexed_at:chrono::Utc::now(),
194			symbol_count:symbols.len() as u32,
195		},
196		symbols,
197	))
198}
199
200/// Validate file access and permissions before scanning
201pub async fn ValidateFileAccess(file_path:&PathBuf) -> bool {
202	tokio::task::spawn_blocking({
203		let file_path = file_path.to_path_buf();
204
205		move || {
206			// Try to read file metadata
207			let can_access = std::fs::metadata(&file_path).is_ok();
208
209			if can_access {
210				// Try to open file for reading
211				std::fs::File::open(&file_path).is_ok()
212			} else {
213				false
214			}
215		}
216	})
217	.await
218	.unwrap_or(false)
219}
220
221/// Calculate SHA-256 checksum for file content
222pub fn CalculateChecksum(content:&[u8]) -> String {
223	// sha2 0.11 moved `Digest::finalize()` to `hybrid_array::Array`, which has
224	// no `LowerHex` impl (the old `GenericArray` did). `hex::encode` over the
225	// byte output is the drop-in replacement - same lowercase hex string,
226	// same length. `hex` is already a workspace dependency of Air.
227	use sha2::{Digest, Sha256};
228
229	let mut hasher = Sha256::new();
230
231	hasher.update(content);
232
233	hex::encode(hasher.finalize())
234}
235
/// Render the owner/group/other permission bits as a nine character
/// `rwxrwxrwx`-style string, with `-` for every bit that is not set.
#[cfg(unix)]
pub fn GetPermissionsString(metadata:&std::fs::Metadata) -> String {
	use std::os::unix::fs::PermissionsExt;

	// One entry per output slot, in display order: owner, group, other.
	const SLOTS:[(u32, char); 9] = [
		(0o400, 'r'),
		(0o200, 'w'),
		(0o100, 'x'),
		(0o040, 'r'),
		(0o020, 'w'),
		(0o010, 'x'),
		(0o004, 'r'),
		(0o002, 'w'),
		(0o001, 'x'),
	];

	let bits = metadata.permissions().mode();

	SLOTS
		.iter()
		.map(|&(mask, flag)| if bits & mask != 0 { flag } else { '-' })
		.collect()
}
270
/// Get file permissions as string for non-Unix systems
///
/// This variant does not inspect `_metadata`; every slot is reported as
/// unset. The placeholder is nine characters long so it has the same shape
/// as the `rwxrwxrwx` string produced by the Unix implementation.
#[cfg(not(unix))]
pub fn GetPermissionsString(_metadata:&std::fs::Metadata) -> String { "---------".to_string() }
274
275/// Scan file and return just the metadata (without symbols)
276pub async fn ScanFileMetadata(file_path:&PathBuf) -> Result<FileMetadata> {
277	let metadata = std::fs::metadata(file_path)
278		.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
279
280	let modified = metadata
281		.modified()
282		.map_err(|e| AirError::FileSystem(format!("Failed to get modification time: {}", e)))?;
283
284	let modified_time = chrono::DateTime::<chrono::Utc>::from(modified);
285
286	Ok(FileMetadata {
287		path:file_path.clone(),
288		size:metadata.len(),
289		modified:modified_time,
290		mime_type:"application/octet-stream".to_string(),
291		language:None,
292		line_count:None,
293		checksum:String::new(),
294		is_symlink:metadata.file_type().is_symlink(),
295		permissions:GetPermissionsString(&metadata),
296		encoding:None,
297		indexed_at:chrono::Utc::now(),
298		symbol_count:0,
299	})
300}
301
302/// Check if file has been modified since last indexed
303pub fn FileModifiedSince(file_path:&PathBuf, last_indexed:chrono::DateTime<chrono::Utc>) -> Result<bool> {
304	let metadata = std::fs::metadata(file_path)
305		.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
306
307	let modified = metadata
308		.modified()
309		.map_err(|e| AirError::FileSystem(format!("Failed to get modification time: {}", e)))?;
310
311	let modified_time = chrono::DateTime::<chrono::Utc>::from(modified);
312
313	Ok(modified_time > last_indexed)
314}
315
316/// Get file size with error handling
317pub async fn GetFileSize(file_path:&PathBuf) -> Result<u64> {
318	tokio::task::spawn_blocking({
319		let file_path = file_path.to_path_buf();
320
321		move || {
322			let metadata = std::fs::metadata(&file_path)
323				.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
324
325			Ok(metadata.len())
326		}
327	})
328	.await?
329}
330
331/// Check if file is text-based (likely to be code or documentation)
332pub fn IsTextFile(metadata:&FileMetadata) -> bool {
333	metadata.mime_type.starts_with("text/")
334		|| metadata.mime_type.contains("json")
335		|| metadata.mime_type.contains("xml")
336		|| metadata.mime_type.contains("yaml")
337		|| metadata.mime_type.contains("toml")
338		|| metadata.language.is_some()
339}
340
341/// Check if file is binary (not suitable for indexing)
342pub fn IsBinaryFile(metadata:&FileMetadata) -> bool {
343	!IsTextFile(metadata)
344		|| metadata.mime_type == "application/octet-stream"
345		|| metadata.mime_type == "application/zip"
346		|| metadata.mime_type == "application/x-tar"
347		|| metadata.mime_type == "application/x-gzip"
348		|| metadata.mime_type == "application/x-bzip2"
349}