AirLibrary/Indexing/Process/
ProcessContent.rs1use std::path::PathBuf;
63
64use crate::Result;
65
/// Guesses the text encoding of `content` from its leading bytes.
///
/// Detection order:
/// 1. Empty input yields `None`.
/// 2. A byte-order mark yields the matching `"UTF-8 (BOM)"`, `"UTF-32 (..)"`
///    or `"UTF-16 (..)"` label.
/// 3. Input made only of ASCII bytes yields `"ASCII"`.
/// 4. Anything else is reported as `"UTF-8"`; the bytes are *not* validated,
///    so this last answer is a fallback rather than a guarantee.
///
/// # Returns
///
/// `Some(label)` for any non-empty input, `None` for empty input.
pub fn DetectEncoding(content:&[u8]) -> Option<String> {
    // Longer signatures must be tested before the shorter ones they start
    // with: the UTF-32 LE mark (FF FE 00 00) begins with the UTF-16 LE mark
    // (FF FE), so checking UTF-16 first would make UTF-32 LE undetectable.
    // The flip side is that UTF-16 LE text whose first character is U+0000
    // is reported as UTF-32 LE; the longer match is the conventional choice.
    const BYTE_ORDER_MARKS:&[(&[u8], &str)] = &[
        (&[0xEF, 0xBB, 0xBF], "UTF-8 (BOM)"),
        (&[0x00, 0x00, 0xFE, 0xFF], "UTF-32 (BE)"),
        (&[0xFF, 0xFE, 0x00, 0x00], "UTF-32 (LE)"),
        (&[0xFE, 0xFF], "UTF-16 (BE)"),
        (&[0xFF, 0xFE], "UTF-16 (LE)"),
    ];

    if content.is_empty() {
        return None;
    }

    for &(signature, label) in BYTE_ORDER_MARKS {
        if content.starts_with(signature) {
            return Some(label.to_string());
        }
    }

    let label = if content.is_ascii() { "ASCII" } else { "UTF-8" };
    Some(label.to_string())
}
101
102pub fn DetectMimeType(file_path:&PathBuf, content:&[u8]) -> String {
104 if let Some(extension) = file_path.extension() {
105 match extension.to_string_lossy().to_lowercase().as_str() {
106 "rs" => "text/x-rust".to_string(),
107 "ts" => "text/x-typescript".to_string(),
108 "tsx" => "text/typescript-jsx".to_string(),
109 "js" => "text/javascript".to_string(),
110 "jsx" => "text/javascript-jsx".to_string(),
111 "mjs" => "text/javascript".to_string(),
112 "cjs" => "text/javascript".to_string(),
113 "json" => "application/json".to_string(),
114 "jsonc" => "application/json+comments".to_string(),
115 "toml" => "text/x-toml".to_string(),
116 "yaml" | "yml" => "text/x-yaml".to_string(),
117 "md" => "text/markdown".to_string(),
118 "mdx" => "text/markdown-jsx".to_string(),
119 "txt" => "text/plain".to_string(),
120 "html" | "htm" => "text/html".to_string(),
121 "css" => "text/css".to_string(),
122 "scss" => "text/x-scss".to_string(),
123 "sass" => "text/x-sass".to_string(),
124 "less" => "text/x-less".to_string(),
125 "xml" => "application/xml".to_string(),
126 "py" => "text/x-python".to_string(),
127 "java" => "text/x-java".to_string(),
128 "go" => "text/x-go".to_string(),
129 "sh" => "text/x-shellscript".to_string(),
130 "bash" => "text/x-shellscript".to_string(),
131 "zsh" => "text/x-shellscript".to_string(),
132 "fish" => "text/x-shellscript".to_string(),
133 "rb" => "text/x-ruby".to_string(),
134 "php" => "text/x-php".to_string(),
135 "swift" => "text/x-swift".to_string(),
136 "kt" | "kts" => "text/x-kotlin".to_string(),
137 "scala" => "text/x-scala".to_string(),
138 "cs" => "text/x-csharp".to_string(),
139 "vb" => "text/x-vbnet".to_string(),
140 "f#" => "text/x-fsharp".to_string(),
141 "r" => "text/x-r".to_string(),
142 "lua" => "text/x-lua".to_string(),
143 "pl" => "text/x-perl".to_string(),
144 "ps1" => "text/x-powershell".to_string(),
145 "sql" => "text/x-sql".to_string(),
146 "graphql" | "gql" => "application/graphql".to_string(),
147 "graphqls" => "application/graphql".to_string(),
148 "proto" => "text/x-protobuf".to_string(),
149 "wasm" => "application/wasm".to_string(),
150 "wat" => "text/x-wat".to_string(),
151 "lock" => "application/json".to_string(),
152 "graphqlconfig" => "application/json".to_string(),
153 "graphqlrc" => "application/json".to_string(),
154 "graphqlconfig.yaml" | "graphqlrc.yaml" => "text/x-yaml".to_string(),
155 "graphqlrc.yml" => "text/x-yaml".to_string(),
156 "graphqlconfig.json" | "graphqlrc.json" => "application/json".to_string(),
157 "graphqlconfig.js" | "graphqlrc.js" => "text/javascript".to_string(),
158 "graphqlconfig.ts" | "graphqlrc.ts" => "text/x-typescript".to_string(),
159 "graphqlconfig.toml" | "graphqlrc.toml" => "text/x-toml".to_string(),
160 _ => {
161 DetectMimeTypeFromContent(content)
163 },
164 }
165 } else {
166 DetectMimeTypeFromContent(content)
168 }
169}
170
/// Guesses a MIME type from the first bytes of `content`.
///
/// Checks are applied in this order and the first match wins:
/// empty input -> `application/octet-stream`; leading `{` or `[` ->
/// `application/json`; leading `#!` -> `text/x-shellscript`; leading `<?xml`
/// -> `application/xml`; leading `<!DOCTYPE` or `<html` -> `text/html`;
/// leading `---` -> `text/x-yaml`; pure ASCII without any NUL byte ->
/// `text/plain`; everything else -> `application/octet-stream`.
///
/// The prefixes are matched at offset 0 exactly (no whitespace skipping,
/// case-sensitive).
fn DetectMimeTypeFromContent(content:&[u8]) -> String {
    let mime = if content.is_empty() {
        "application/octet-stream"
    } else if content.starts_with(b"{") || content.starts_with(b"[") {
        "application/json"
    } else if content.starts_with(b"#!") {
        "text/x-shellscript"
    } else if content.starts_with(b"<?xml") {
        "application/xml"
    } else if content.starts_with(b"<!DOCTYPE") || content.starts_with(b"<html") {
        "text/html"
    } else if content.starts_with(b"---") {
        "text/x-yaml"
    } else if content.is_ascii() && !content.contains(&0) {
        // NUL is itself an ASCII byte, so it has to be excluded explicitly.
        // Scanning every byte (rather than 4-byte windows) also covers NULs
        // in the final three bytes and in inputs shorter than four bytes.
        "text/plain"
    } else {
        "application/octet-stream"
    };

    mime.to_string()
}
193
/// Determines the language identifier for the file at `file_path`.
///
/// * If the path has an extension, only the extension is consulted
///   (case-insensitively). An unknown extension yields `None`; the file is
///   not opened in that case.
/// * If the path has no extension, the first line of the file is read and,
///   when it is a `#!` shebang, the interpreter name decides the language.
///
/// Only the beginning of the file is read (at most `SHEBANG_SCAN_LIMIT`
/// bytes, up to the first newline) and invalid UTF-8 is decoded lossily, so
/// a large or partly binary file does not prevent shebang detection.
///
/// # Returns
///
/// `Some(identifier)` such as `"rust"` or `"shellscript"`, or `None` when the
/// language is unknown or the file cannot be opened or read.
pub fn DetectLanguage(file_path:&PathBuf) -> Option<String> {
    const SHEBANG_SCAN_LIMIT:u64 = 512;

    if let Some(extension) = file_path.extension() {
        let extension = extension.to_string_lossy().to_lowercase();
        return LanguageFromExtension(&extension).map(str::to_string);
    }

    let file = std::fs::File::open(file_path).ok()?;
    let mut reader = std::io::BufReader::new(std::io::Read::take(file, SHEBANG_SCAN_LIMIT));
    let mut first_line = Vec::new();
    std::io::BufRead::read_until(&mut reader, b'\n', &mut first_line).ok()?;

    LanguageFromShebang(&String::from_utf8_lossy(&first_line)).map(str::to_string)
}

/// Looks up the language identifier for an already lower-cased extension.
///
/// `Path::extension` never contains a dot, so only single-segment keys exist.
fn LanguageFromExtension(extension:&str) -> Option<&'static str> {
    let language = match extension {
        "rs" => "rust",
        "ts" | "tsx" => "typescript",
        "js" | "jsx" | "mjs" | "cjs" => "javascript",
        "json" | "jsonc" | "graphqlconfig" | "graphqlrc" | "lock" => "json",
        "toml" => "toml",
        "yaml" | "yml" => "yaml",
        "md" | "mdx" => "markdown",
        "txt" => "plaintext",
        "html" | "htm" => "html",
        "css" => "css",
        "scss" => "scss",
        "sass" => "sass",
        "less" => "less",
        "xml" => "xml",
        "py" => "python",
        "java" => "java",
        "go" => "go",
        "sh" | "bash" | "zsh" => "shellscript",
        "fish" => "fish",
        "rb" => "ruby",
        "php" => "php",
        "swift" => "swift",
        "kt" | "kts" => "kotlin",
        "scala" => "scala",
        "cpp" | "cc" | "cxx" | "hpp" | "hxx" => "cpp",
        "c" | "h" => "c",
        "cs" => "csharp",
        "vb" => "vb",
        "f#" | "fs" | "fsi" | "fsx" => "fsharp",
        "r" | "rmd" => "r",
        "jl" => "julia",
        "lua" => "lua",
        "pl" => "perl",
        "ps1" | "psm1" | "psd1" => "powershell",
        "sql" => "sql",
        "graphql" | "gql" | "graphqls" => "graphql",
        "proto" => "protobuf",
        "wasm" => "wasm",
        "wat" => "wat",
        "clj" | "cljs" | "cljc" | "edn" => "clojure",
        "hs" | "lhs" => "haskell",
        "erl" | "hrl" => "erlang",
        "ex" | "exs" => "elixir",
        "dart" => "dart",
        "nim" => "nim",
        "v" => "v",
        "zig" => "zig",
        "odin" => "odin",
        "mojo" => "mojo",
        _ => return None,
    };
    Some(language)
}

/// Extracts the language from a shebang line such as `#!/bin/bash -e` or
/// `#!/usr/bin/env python3`; returns `None` for anything else.
fn LanguageFromShebang(first_line:&str) -> Option<&'static str> {
    fn BaseName(word:&str) -> &str { word.rsplit('/').next().unwrap_or(word) }

    // The interpreter is the first word after `#!` (a space after `#!` is
    // allowed), not the second word of the line: `#!/bin/bash` has no
    // second word at all.
    let mut words = first_line.strip_prefix("#!")?.split_whitespace();
    let mut interpreter = BaseName(words.next()?);

    if interpreter == "env" {
        // With `env`, the real interpreter is the first following word that
        // is neither an option (`-S`) nor a `NAME=value` assignment.
        interpreter = BaseName(words.find(|word| !word.starts_with('-') && !word.contains('='))?);
    }

    let language = match interpreter {
        "bash" | "sh" | "zsh" => "shellscript",
        "fish" => "fish",
        "python" | "python2" | "python3" => "python",
        "node" => "javascript",
        "ruby" => "ruby",
        "perl" => "perl",
        "php" => "php",
        "lua" => "lua",
        "r" | "Rscript" => "r",
        "julia" => "julia",
        "rust" | "rustc" => "rust",
        "go" => "go",
        "java" => "java",
        "scala" | "scalac" => "scala",
        "kotlin" | "kotlinc" => "kotlin",
        "swift" => "swift",
        _ => return None,
    };
    Some(language)
}
286
/// Splits `content` into lower-cased word tokens.
///
/// A token is a maximal run of characters that are alphanumeric (per
/// `char::is_alphanumeric`) or `_`. Every other character acts as a
/// separator and is dropped. Tokens are returned in order of appearance,
/// duplicates included.
pub fn TokenizeContent(content:&str) -> Vec<String> {
    content
        .split(|ch:char| !(ch.is_alphanumeric() || ch == '_'))
        // Adjacent separators produce empty pieces; those are not tokens.
        .filter(|piece| !piece.is_empty())
        .map(str::to_lowercase)
        .collect()
}
312
/// Returns `content` with every control character removed.
///
/// "Control character" is whatever `char::is_control` reports, which
/// includes NUL (`'\0'`).
///
/// NOTE(review): `char::is_control` also matches `'\n'`, `'\r'` and `'\t'`,
/// so line breaks and tabs are stripped and neighbouring words are joined.
/// Confirm that callers expect this.
pub fn SanitizeContent(content:&str) -> String {
    let mut cleaned = String::with_capacity(content.len());
    cleaned.extend(content.chars().filter(|ch| !ch.is_control()));
    cleaned
}
315
316pub fn ContentToString(content:&[u8]) -> Result<String> {
318 String::from_utf8(content.to_vec())
319 .map_err(|e| crate::AirError::FileSystem(format!("Invalid UTF-8 content: {}", e)))
320}
321
/// Heuristically decides whether `content` is binary data.
///
/// Only the first `BINARY_SCAN_LIMIT` bytes are inspected. The content is
/// considered binary when either
/// * that window holds more than `MAX_NULL_BYTES` NUL bytes, or
/// * fewer than 70% of its bytes are "text-like": ASCII graphic, ASCII
///   whitespace, or any byte `>= 0x80`.
///
/// Empty input is reported as not binary.
pub fn IsBinaryContent(content:&[u8]) -> bool {
    const MAX_NULL_BYTES:usize = 10;
    const BINARY_SCAN_LIMIT:usize = 8000;

    let window = match content.get(..BINARY_SCAN_LIMIT) {
        Some(head) => head,
        None => content,
    };

    // Nothing to inspect: no NULs and no ratio to compute.
    if window.is_empty() {
        return false;
    }

    let mut nul_bytes = 0usize;
    let mut text_bytes = 0usize;
    for &byte in window {
        if byte == 0 {
            nul_bytes += 1;
        }
        if byte.is_ascii_graphic() || byte.is_ascii_whitespace() || byte >= 0x80 {
            text_bytes += 1;
        }
    }

    if nul_bytes > MAX_NULL_BYTES {
        return true;
    }

    (text_bytes as f64) / (window.len() as f64) < 0.7
}
345
/// Counts the lines of `content` as produced by `str::lines`.
///
/// Empty input has zero lines; a trailing line terminator does not start an
/// additional line. The count is narrowed with an `as u32` cast, so it would
/// wrap for inputs with more than `u32::MAX` lines.
pub fn GetLineCount(content:&str) -> u32 {
    match content {
        "" => 0,
        text => text.lines().count() as u32,
    }
}
353
/// Returns the number of `char`s (Unicode scalar values) in `content`.
///
/// This is generally different from `content.len()`, which counts bytes.
pub fn GetCharCount(content:&str) -> usize {
    let scalar_values = content.chars();
    scalar_values.count()
}
356
/// Returns the first `max_chars` characters of `content` as a new `String`.
///
/// The cut is made on a `char` boundary, so multi-byte characters are never
/// split. When `content` has `max_chars` characters or fewer it is returned
/// whole.
pub fn TruncateContent(content:&str, max_chars:usize) -> String {
    // The byte offset of the character at index `max_chars` is exactly where
    // the kept prefix ends; copying that slice avoids an intermediate
    // `Vec<char>`.
    match content.char_indices().nth(max_chars) {
        Some((cut, _)) => content[..cut].to_string(),
        None => content.to_string(),
    }
}