AirLibrary/Indexing/Process/
ProcessContent.rs

1//! # ProcessContent
2//!
3//! ## File: Indexing/Process/ProcessContent.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides content processing functionality for the File Indexer service,
8//! handling encoding detection, MIME type detection, and content tokenization.
9//!
10//! ## Primary Responsibility
11//!
12//! Process file content for indexing by detecting encoding, mime types, and
13//! tokenizing text for search operations.
14//!
15//! ## Secondary Responsibilities
16//!
17//! - File encoding detection (UTF-8, UTF-16, ASCII)
18//! - MIME type detection from extensions and content
19//! - Content tokenization for search indexing
20//! - Language detection for code analysis
21//!
22//! ## Dependencies
23//!
24//! **External Crates:**
25//! - None (uses std library)
26//!
27//! **Internal Modules:**
28//! - `crate::Result` - Error handling type
29//!
30//! ## Dependents
31//!
32//! - `Indexing::Scan::ScanFile` - Content processing during file scan
33//! - `Indexing::Store::StoreEntry` - Index storage operations
34//!
35//! ## VSCode Pattern Reference
36//!
37//! Inspired by VSCode's content processing in
38//! `src/vs/base/node/encoding/`
39//!
40//! ## Security Considerations
41//!
42//! - Safe BOM marker detection
43//! - Null byte filtering
44//! - Length limits on processed content
45//!
46//! ## Performance Considerations
47//!
48//! - Efficient tokenization with minimal allocations
49//! - Early termination for binary files
50//! - Lazy content evaluation
51//!
52//! ## Error Handling Strategy
53//!
54//! Content processing functions return Option or safe defaults when
55//! detection fails, rather than errors, to allow indexing to continue.
56//!
57//! ## Thread Safety
58//!
59//! Content processing functions are pure and safe to call from
60//! parallel indexing tasks.
61
62use std::path::PathBuf;
63
64use crate::Result;
65
/// Detect file encoding from BOM markers with simple heuristics.
///
/// Returns `None` for empty input. BOM prefixes are tested longest-first:
/// the UTF-32 LE BOM (`FF FE 00 00`) begins with the UTF-16 LE BOM
/// (`FF FE`), so checking UTF-16 first would make the UTF-32 LE branch
/// unreachable (that was the previous behavior).
pub fn DetectEncoding(content:&[u8]) -> Option<String> {
	if content.is_empty() {
		return None;
	}

	// Check for BOM markers — longest prefixes first so UTF-32 is not
	// shadowed by UTF-16.
	if content.starts_with(&[0xEF, 0xBB, 0xBF]) {
		return Some("UTF-8 (BOM)".to_string());
	}

	if content.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) {
		return Some("UTF-32 (BE)".to_string());
	}

	if content.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) {
		return Some("UTF-32 (LE)".to_string());
	}

	if content.starts_with(&[0xFE, 0xFF]) {
		return Some("UTF-16 (BE)".to_string());
	}

	if content.starts_with(&[0xFF, 0xFE]) {
		return Some("UTF-16 (LE)".to_string());
	}

	// No BOM: pure ASCII when every byte is <= 0x7F.
	if content.iter().all(|&b| b.is_ascii()) {
		return Some("ASCII".to_string());
	}

	// Fall back to UTF-8 (not validated; indexing tolerates a mislabel and
	// continues — see the module's error-handling strategy).
	Some("UTF-8".to_string())
}
101
102/// Detect MIME type with comprehensive file type detection
103pub fn DetectMimeType(file_path:&PathBuf, content:&[u8]) -> String {
104	if let Some(extension) = file_path.extension() {
105		match extension.to_string_lossy().to_lowercase().as_str() {
106			"rs" => "text/x-rust".to_string(),
107			"ts" => "text/x-typescript".to_string(),
108			"tsx" => "text/typescript-jsx".to_string(),
109			"js" => "text/javascript".to_string(),
110			"jsx" => "text/javascript-jsx".to_string(),
111			"mjs" => "text/javascript".to_string(),
112			"cjs" => "text/javascript".to_string(),
113			"json" => "application/json".to_string(),
114			"jsonc" => "application/json+comments".to_string(),
115			"toml" => "text/x-toml".to_string(),
116			"yaml" | "yml" => "text/x-yaml".to_string(),
117			"md" => "text/markdown".to_string(),
118			"mdx" => "text/markdown-jsx".to_string(),
119			"txt" => "text/plain".to_string(),
120			"html" | "htm" => "text/html".to_string(),
121			"css" => "text/css".to_string(),
122			"scss" => "text/x-scss".to_string(),
123			"sass" => "text/x-sass".to_string(),
124			"less" => "text/x-less".to_string(),
125			"xml" => "application/xml".to_string(),
126			"py" => "text/x-python".to_string(),
127			"java" => "text/x-java".to_string(),
128			"go" => "text/x-go".to_string(),
129			"sh" => "text/x-shellscript".to_string(),
130			"bash" => "text/x-shellscript".to_string(),
131			"zsh" => "text/x-shellscript".to_string(),
132			"fish" => "text/x-shellscript".to_string(),
133			"rb" => "text/x-ruby".to_string(),
134			"php" => "text/x-php".to_string(),
135			"swift" => "text/x-swift".to_string(),
136			"kt" | "kts" => "text/x-kotlin".to_string(),
137			"scala" => "text/x-scala".to_string(),
138			"cs" => "text/x-csharp".to_string(),
139			"vb" => "text/x-vbnet".to_string(),
140			"f#" => "text/x-fsharp".to_string(),
141			"r" => "text/x-r".to_string(),
142			"lua" => "text/x-lua".to_string(),
143			"pl" => "text/x-perl".to_string(),
144			"ps1" => "text/x-powershell".to_string(),
145			"sql" => "text/x-sql".to_string(),
146			"graphql" | "gql" => "application/graphql".to_string(),
147			"graphqls" => "application/graphql".to_string(),
148			"proto" => "text/x-protobuf".to_string(),
149			"wasm" => "application/wasm".to_string(),
150			"wat" => "text/x-wat".to_string(),
151			"lock" => "application/json".to_string(),
152			"graphqlconfig" => "application/json".to_string(),
153			"graphqlrc" => "application/json".to_string(),
154			"graphqlconfig.yaml" | "graphqlrc.yaml" => "text/x-yaml".to_string(),
155			"graphqlrc.yml" => "text/x-yaml".to_string(),
156			"graphqlconfig.json" | "graphqlrc.json" => "application/json".to_string(),
157			"graphqlconfig.js" | "graphqlrc.js" => "text/javascript".to_string(),
158			"graphqlconfig.ts" | "graphqlrc.ts" => "text/x-typescript".to_string(),
159			"graphqlconfig.toml" | "graphqlrc.toml" => "text/x-toml".to_string(),
160			_ => {
161				// Use content-based detection
162				DetectMimeTypeFromContent(content)
163			},
164		}
165	} else {
166		// No extension, try content-based detection
167		DetectMimeTypeFromContent(content)
168	}
169}
170
/// Detect MIME type from content (magic numbers / leading markers).
///
/// Falls back to `application/octet-stream` for empty or non-text content.
/// Marker checks inspect only the first bytes, so binary files with none of
/// these prefixes terminate quickly in the final branch.
fn DetectMimeTypeFromContent(content:&[u8]) -> String {
	if content.is_empty() {
		return "application/octet-stream".to_string();
	}

	if content.starts_with(b"{") || content.starts_with(b"[") {
		"application/json".to_string()
	} else if content.starts_with(b"#!") {
		"text/x-shellscript".to_string()
	} else if content.starts_with(b"<?xml") {
		"application/xml".to_string()
	} else if content.starts_with(b"<!DOCTYPE") || content.starts_with(b"<html") {
		"text/html".to_string()
	} else if content.starts_with(b"---") {
		"text/x-yaml".to_string()
	} else if content.is_ascii() && !content.contains(&0) {
		// Plain text only when fully ASCII and NUL-free. The previous
		// `windows(4)` scan missed NUL bytes in the final 3 bytes (and in
		// content shorter than 4 bytes), letting short binary blobs be
		// classified as text/plain.
		"text/plain".to_string()
	} else {
		"application/octet-stream".to_string()
	}
}
193
/// Detect a programming-language identifier for a file.
///
/// Extension lookup first; extensionless files fall back to reading the file
/// and inspecting its shebang line. Returns `None` when neither yields a
/// known language — detection failure is not an error, so indexing can
/// continue. Takes `&Path` so both `&Path` and `&PathBuf` callers work.
pub fn DetectLanguage(file_path:&Path) -> Option<String> {
	if let Some(extension) = file_path.extension() {
		let lang = match extension.to_string_lossy().to_lowercase().as_str() {
			"rs" => "rust",
			"ts" | "tsx" => "typescript",
			"js" | "jsx" | "mjs" | "cjs" => "javascript",
			"json" | "jsonc" | "graphqlconfig" | "graphqlrc" | "lock" => "json",
			// NOTE(review): the dotted entries below can never match —
			// `Path::extension` yields only the segment after the last dot.
			// Kept verbatim for parity with the MIME table; confirm before
			// removing.
			"toml" | "graphqlconfig.toml" | "graphqlrc.toml" => "toml",
			"yaml" | "yml" | "graphqlconfig.yaml" | "graphqlrc.yaml" | "graphqlrc.yml" => "yaml",
			"md" | "mdx" => "markdown",
			"txt" => "plaintext",
			"html" | "htm" => "html",
			"css" => "css",
			"scss" => "scss",
			"sass" => "sass",
			"less" => "less",
			"xml" => "xml",
			"py" => "python",
			"java" => "java",
			"go" => "go",
			"sh" | "bash" | "zsh" => "shellscript",
			"fish" => "fish",
			"rb" => "ruby",
			"php" => "php",
			"swift" => "swift",
			"kt" | "kts" => "kotlin",
			"scala" => "scala",
			"cpp" | "cc" | "cxx" | "hpp" | "hxx" => "cpp",
			"c" | "h" => "c",
			"cs" => "csharp",
			"vb" => "vb",
			"f#" | "fs" | "fsi" | "fsx" => "fsharp",
			"r" | "rmd" => "r",
			"jl" => "julia",
			"lua" => "lua",
			"pl" => "perl",
			"ps1" | "psm1" | "psd1" => "powershell",
			"sql" => "sql",
			"graphql" | "gql" | "graphqls" => "graphql",
			"proto" => "protobuf",
			"wasm" => "wasm",
			"wat" => "wat",
			"clj" | "cljs" | "cljc" | "edn" => "clojure",
			"hs" | "lhs" => "haskell",
			"erl" | "hrl" => "erlang",
			"ex" | "exs" => "elixir",
			"dart" => "dart",
			"nim" => "nim",
			"v" => "v",
			"zig" => "zig",
			"odin" => "odin",
			"mojo" => "mojo",
			// NOTE(review): an unknown extension skips the shebang fallback
			// below — preserved from the original; confirm this is intended.
			_ => return None,
		};

		return Some(lang.to_string());
	}

	// No extension: read the file and inspect its shebang, if any. Read
	// errors are swallowed (detection is best-effort).
	if let Ok(content) = std::fs::read_to_string(file_path) {
		if let Some(first_line) = content.lines().next() {
			if let Some(rest) = first_line.strip_prefix("#!") {
				// The first token after "#!" is the interpreter path; with
				// the `#!/usr/bin/env <interpreter>` form the real
				// interpreter is the following token. (The previous code
				// took the second whitespace token of the WHOLE line, which
				// failed on direct shebangs such as `#!/bin/bash` — no
				// second token — and always returned None for them.)
				let mut words = rest.trim().split_whitespace();
				let mut interpreter = words.next().unwrap_or("").rsplit('/').next().unwrap_or("");
				if interpreter == "env" {
					interpreter = words.next().unwrap_or("");
				}

				let lang = match interpreter {
					"bash" | "sh" | "zsh" => "shellscript",
					"fish" => "fish",
					"python" | "python2" | "python3" => "python",
					"node" => "javascript",
					"ruby" => "ruby",
					"perl" => "perl",
					"php" => "php",
					"lua" => "lua",
					// "Rscript" is matched case-sensitively, as before.
					"r" | "Rscript" => "r",
					"julia" => "julia",
					"rust" | "rustc" => "rust",
					"go" => "go",
					"java" => "java",
					"scala" | "scalac" => "scala",
					"kotlin" | "kotlinc" => "kotlin",
					"swift" => "swift",
					_ => return None,
				};

				return Some(lang.to_string());
			}
		}
	}

	None
}
286
/// Tokenize content for indexing.
///
/// A token is a maximal run of alphanumeric characters or underscores;
/// every other character is a separator. Tokens are lowercased for
/// case-insensitive search. Returns tokens in document order; empty input
/// yields an empty vector.
pub fn TokenizeContent(content:&str) -> Vec<String> {
	content
		// Split on every non-token character; adjacent separators produce
		// empty pieces, which the filter drops.
		.split(|c:char| !(c.is_alphanumeric() || c == '_'))
		.filter(|piece| !piece.is_empty())
		.map(str::to_lowercase)
		.collect()
}
312
/// Remove null bytes and control characters from content.
///
/// Note that `char::is_control` also covers `\n`, `\r`, and `\t`, so line
/// structure is flattened; callers needing to keep newlines should not use
/// this helper.
pub fn SanitizeContent(content:&str) -> String {
	let mut cleaned = String::with_capacity(content.len());

	for ch in content.chars() {
		// '\0' is itself a control character; the explicit check is kept
		// for parity with the documented "null byte" contract.
		if ch != '\0' && !ch.is_control() {
			cleaned.push(ch);
		}
	}

	cleaned
}
315
316/// Convert content to UTF-8 string with error handling
317pub fn ContentToString(content:&[u8]) -> Result<String> {
318	String::from_utf8(content.to_vec())
319		.map_err(|e| crate::AirError::FileSystem(format!("Invalid UTF-8 content: {}", e)))
320}
321
/// Heuristic check for binary content.
///
/// Content is treated as binary when the first `BINARY_SCAN_LIMIT` bytes
/// contain more than `MAX_NULL_BYTES` NUL bytes, or when fewer than 70% of
/// the scanned bytes look text-like (printable ASCII, ASCII whitespace, or
/// bytes >= 0x80, which may belong to multi-byte UTF-8 sequences).
/// Empty content is not binary.
pub fn IsBinaryContent(content:&[u8]) -> bool {
	const MAX_NULL_BYTES:usize = 10;
	const BINARY_SCAN_LIMIT:usize = 8000;

	// Explicit guard: the previous version computed 0.0 / 0.0 (NaN) for
	// empty input and only returned false because `NaN < 0.7` is false.
	if content.is_empty() {
		return false;
	}

	// Only the first chunk is scanned so large files terminate early.
	let scan_bytes = &content[..content.len().min(BINARY_SCAN_LIMIT)];

	// A burst of NUL bytes is a strong binary signal.
	let null_count = scan_bytes.iter().filter(|&&b| b == 0).count();

	if null_count > MAX_NULL_BYTES {
		return true;
	}

	// Otherwise require a minimum ratio of plausible text bytes.
	let text_ratio = scan_bytes
		.iter()
		.filter(|&&b| b.is_ascii_graphic() || b.is_ascii_whitespace() || b >= 0x80)
		.count() as f64
		/ scan_bytes.len() as f64;

	text_ratio < 0.7
}
345
/// Count the lines in `content`.
///
/// Empty input counts as zero lines; a trailing newline does not add an
/// extra empty line, per `str::lines` semantics.
pub fn GetLineCount(content:&str) -> u32 {
	if content.is_empty() { 0 } else { content.lines().count() as u32 }
}
353
/// Count the Unicode scalar values (`char`s) in `content`.
///
/// This is the `char` count, not the byte length: a multi-byte UTF-8
/// sequence counts once.
pub fn GetCharCount(content:&str) -> usize {
	content.chars().count()
}
356
/// Truncate `content` to at most `max_chars` Unicode scalar values.
///
/// Truncation is by `char`, so multi-byte UTF-8 sequences are never split.
/// Collects directly into the output `String`, dropping the intermediate
/// `Vec<char>` the previous version allocated.
pub fn TruncateContent(content:&str, max_chars:usize) -> String {
	content.chars().take(max_chars).collect()
}