// Developed by Manfred Lotz in cooperation with Claude (Anthropic). use anyhow::{Context, Result}; use std::fmt; use std::fs::File; use std::io::Read; use std::path::Path; #[derive(Debug, Clone)] pub enum LineEnding { Lf, CrLf, Cr, Mixed(usize, usize, usize), // (cr, lf, crlf) } impl fmt::Display for LineEnding { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { LineEnding::Lf => write!(f, "LF"), LineEnding::CrLf => write!(f, "CRLF"), LineEnding::Cr => write!(f, "CR"), LineEnding::Mixed(cr, lf, crlf) => { write!(f, "mixed endings ({cr} CR, {lf} LF, {crlf} CRLF)") } } } } #[derive(Debug, Clone)] pub enum BomKind { Utf8, Utf16Le, Utf16Be, Utf32Le, Utf32Be, } impl fmt::Display for BomKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { BomKind::Utf8 => write!(f, "UTF-8 BOM"), BomKind::Utf16Le => write!(f, "UTF-16 LE BOM"), BomKind::Utf16Be => write!(f, "UTF-16 BE BOM"), BomKind::Utf32Le => write!(f, "UTF-32 LE BOM"), BomKind::Utf32Be => write!(f, "UTF-32 BE BOM"), } } } #[derive(Debug, Clone)] pub enum DetectResult { Pdf, Elf, PostScript, Zip, Archive, Png, Mime(String), // infer recognised something we don't have a dedicated variant for Text(LineEnding), Script(LineEnding, String), // (line ending, interpreter) Bom(BomKind), UnknownBinary, Zerofile, Directory, BrokenSymlink, Skipped, } impl fmt::Display for DetectResult { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { DetectResult::Pdf => write!(f, "PDF"), DetectResult::Elf => write!(f, "ELF binary"), DetectResult::PostScript => write!(f, "PostScript"), DetectResult::Zip => write!(f, "ZIP"), DetectResult::Archive => write!(f, "archive"), DetectResult::Png => write!(f, "PNG image"), DetectResult::Mime(m) => write!(f, "{m}"), DetectResult::Text(le) => write!(f, "text ({le})"), DetectResult::Script(le, interp) => write!(f, "script ({le}) {interp}"), DetectResult::Bom(b) => write!(f, "{b}"), DetectResult::UnknownBinary => write!(f, "unknown binary"), DetectResult::Zerofile => write!(f, "zero-length file"), DetectResult::Directory => write!(f, "directory"), DetectResult::BrokenSymlink => write!(f, "broken symlink"), DetectResult::Skipped => write!(f, "skipped"), } } } fn detect_line_endings(bytes: &[u8]) -> LineEnding { let mut crlf = 0usize; let mut lf = 0usize; let mut cr = 0usize; let mut i = 0; while i < bytes.len() { match bytes[i] { b'\r' if i + 1 < bytes.len() && bytes[i + 1] == b'\n' => { crlf += 1; i += 2; } b'\r' => { cr += 1; i += 1; } b'\n' => { lf += 1; i += 1; } _ => i += 1, } } match (crlf > 0, lf > 0, cr > 0) { (true, false, false) => LineEnding::CrLf, (false, true, false) => LineEnding::Lf, (false, false, true) => LineEnding::Cr, (false, false, false) => LineEnding::Lf, // no line endings at all, default _ => LineEnding::Mixed(cr, lf, crlf), } } fn is_v7_tar(buf: &[u8]) -> bool { if buf.len() < 512 { return false; } let computed: u32 = buf[..512] .iter() .enumerate() .map(|(i, &b)| { if (148..156).contains(&i) { 0x20u32 } else { b as u32 } }) .sum(); let Ok(s) = std::str::from_utf8(&buf[148..156]) else { return false; }; let s = s.trim_matches(|c: char| c == '\0' || c == ' '); u32::from_str_radix(s, 8).is_ok_and(|stored| stored == computed) } fn is_text_content(bytes: &[u8]) -> bool { if bytes.contains(&0u8) { return false; } if std::str::from_utf8(bytes).is_ok() { return true; } let printable = bytes .iter() .filter(|&&b| b >= 0x20 || matches!(b, b'\n' | b'\r' | b'\t')) .count(); printable * 100 / bytes.len() > 90 } fn extract_interpreter(buf: &[u8]) -> String { let line = buf[2..] .split(|&b| b == b'\n' || b == b'\r') .next() .unwrap_or(&[]); std::str::from_utf8(line) .unwrap_or("") .split_whitespace() .next() .unwrap_or("") .to_string() } fn detect_bom(buf: &[u8]) -> Option { // UTF-32 must be checked before UTF-16 — they share a leading byte pair if buf.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) { Some(BomKind::Utf32Le) } else if buf.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) { Some(BomKind::Utf32Be) } else if buf.starts_with(&[0xEF, 0xBB, 0xBF]) { Some(BomKind::Utf8) } else if buf.starts_with(&[0xFF, 0xFE]) { Some(BomKind::Utf16Le) } else if buf.starts_with(&[0xFE, 0xFF]) { Some(BomKind::Utf16Be) } else { None } } fn classify_mime(mime: &str, buf: &[u8]) -> DetectResult { match mime { "application/pdf" => DetectResult::Pdf, // PostScript stub: %! is only 2 bytes — must have content beyond the signature "application/postscript" if buf.len() <= 2 => { if is_text_content(buf) { DetectResult::Text(detect_line_endings(buf)) } else { DetectResult::UnknownBinary } } // infer triggers on %! alone; real PostScript starts with %!PS — anything else // (e.g. %!TEX magic comments in TeX files) is just text "application/postscript" if !buf.starts_with(b"%!PS") => { if is_text_content(buf) { DetectResult::Text(detect_line_endings(buf)) } else { DetectResult::UnknownBinary } } "application/postscript" => DetectResult::PostScript, "application/zip" | "application/x-zip-compressed" => DetectResult::Zip, "application/gzip" | "application/x-tar" | "application/x-bzip2" | "application/x-xz" | "application/zstd" => DetectResult::Archive, // PNG stub: signature is 8 bytes — must have content beyond the signature "image/png" if buf.len() <= 8 => DetectResult::UnknownBinary, "image/png" => DetectResult::Png, "application/x-executable" | "application/x-pie-executable" | "application/x-sharedlib" | "application/x-mach-binary" => DetectResult::Elf, "text/x-shellscript" => { DetectResult::Script(detect_line_endings(buf), extract_interpreter(buf)) } _ => DetectResult::Mime(mime.to_string()), } } pub fn detect(path: &Path) -> Result { let ft = path .symlink_metadata() .with_context(|| format!("stat {}", path.display()))? .file_type(); if ft.is_dir() { return Ok(DetectResult::Directory); } if ft.is_symlink() { return Ok(if path.exists() { DetectResult::Skipped } else { DetectResult::BrokenSymlink }); } if !ft.is_file() { return Ok(DetectResult::Skipped); } let mut buf = [0u8; 8192]; let n = File::open(path) .with_context(|| format!("opening {}", path.display()))? .read(&mut buf)?; let buf = &buf[..n]; if buf.is_empty() { return Ok(DetectResult::Zerofile); } if let Some(bom) = detect_bom(buf) { return Ok(DetectResult::Bom(bom)); } // DOS EPS binary preview (EPSI): binary wrapper around PostScript + TIFF if buf.starts_with(&[0xC5, 0xD0, 0xD3, 0xC6]) { return Ok(DetectResult::PostScript); } if let Some(t) = infer::get(buf) { return Ok(classify_mime(t.mime_type(), buf)); } // V7/pre-POSIX tar has no ustar magic; detect via header checksum validation. // Placed before LZMA so a tar whose first filename byte is ≤ 224 doesn't fall through. if is_v7_tar(buf) { return Ok(DetectResult::Archive); } // LZMA old format has no distinctive magic; detect via heuristic: properties byte ≤ 224 // and a 32-bit LE dictionary size that is a non-zero power of two. Placed after infer so // formats sharing the leading-byte pattern (e.g. TrueType 00 01 00 00) are already handled. if buf.len() >= 5 && buf[0] <= 224 { let dict = u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]); if dict != 0 && dict.is_power_of_two() { return Ok(DetectResult::Archive); } } if !is_text_content(buf) { return Ok(DetectResult::UnknownBinary); } let le = detect_line_endings(buf); if buf.starts_with(b"#!") { Ok(DetectResult::Script(le, extract_interpreter(buf))) } else { Ok(DetectResult::Text(le)) } } #[cfg(test)] mod tests { use super::*; use std::fs; use std::os::unix::fs::symlink; // ── detect_line_endings ────────────────────────────────────────────────── #[test] fn line_endings_lf() { assert!(matches!(detect_line_endings(b"foo\nbar\n"), LineEnding::Lf)); } #[test] fn line_endings_crlf() { assert!(matches!( detect_line_endings(b"foo\r\nbar\r\n"), LineEnding::CrLf )); } #[test] fn line_endings_cr() { assert!(matches!(detect_line_endings(b"foo\rbar\r"), LineEnding::Cr)); } #[test] fn line_endings_mixed() { // 1 CR, 1 LF, 1 CRLF assert!(matches!( detect_line_endings(b"foo\nbar\r\nbaz\r"), LineEnding::Mixed(1, 1, 1) )); } #[test] fn line_endings_none_defaults_lf() { assert!(matches!( detect_line_endings(b"no newlines"), LineEnding::Lf )); } // ── detect_bom ────────────────────────────────────────────────────────── #[test] fn bom_utf8() { assert!(matches!( detect_bom(&[0xEF, 0xBB, 0xBF, b'a']), Some(BomKind::Utf8) )); } #[test] fn bom_utf16_le() { assert!(matches!( detect_bom(&[0xFF, 0xFE, b'a', 0x00]), Some(BomKind::Utf16Le) )); } #[test] fn bom_utf16_be() { assert!(matches!( detect_bom(&[0xFE, 0xFF, 0x00, b'a']), Some(BomKind::Utf16Be) )); } #[test] fn bom_utf32_le_not_confused_with_utf16_le() { // \xFF\xFE\x00\x00 must be UTF-32 LE, not UTF-16 LE assert!(matches!( detect_bom(&[0xFF, 0xFE, 0x00, 0x00]), Some(BomKind::Utf32Le) )); } #[test] fn bom_utf32_be() { assert!(matches!( detect_bom(&[0x00, 0x00, 0xFE, 0xFF]), Some(BomKind::Utf32Be) )); } #[test] fn bom_none() { assert!(detect_bom(b"hello world").is_none()); } // ── is_text_content ───────────────────────────────────────────────────── #[test] fn text_content_valid_utf8() { assert!(is_text_content(b"hello, world!\n")); } #[test] fn text_content_null_byte_is_binary() { assert!(!is_text_content(b"hello\x00world")); } #[test] fn text_content_high_control_bytes_is_binary() { assert!(!is_text_content(&[ 0x80, 0x81, 0x82, 0x83, 0x01, 0x02, 0x03, 0x04 ])); } // ── extract_interpreter ────────────────────────────────────────────────── #[test] fn interpreter_env() { assert_eq!( extract_interpreter(b"#!/usr/bin/env python3\ncode"), "/usr/bin/env" ); } #[test] fn interpreter_direct() { assert_eq!(extract_interpreter(b"#!/bin/bash\n"), "/bin/bash"); } #[test] fn interpreter_with_flag_strips_args() { assert_eq!( extract_interpreter(b"#!/usr/bin/perl -w\n"), "/usr/bin/perl" ); } // ── classify_mime ──────────────────────────────────────────────────────── #[test] fn classify_pdf() { assert!(matches!( classify_mime("application/pdf", b"%PDF-1.4"), DetectResult::Pdf )); } #[test] fn classify_postscript_with_content() { assert!(matches!( classify_mime("application/postscript", b"%!PS-Adobe-3.0\nextra"), DetectResult::PostScript )); } #[test] fn classify_tex_magic_comment_is_text() { // %!TEX shares the %! prefix with PostScript but is not PostScript let buf = b"%!TEX TS-program = Arara\n\\documentclass{article}"; assert!(matches!( classify_mime("application/postscript", buf), DetectResult::Text(_) )); } #[test] fn classify_postscript_stub_reclassified_as_text() { // exactly 2 bytes — only the signature, no content assert!(matches!( classify_mime("application/postscript", b"%!"), DetectResult::Text(_) )); } #[test] fn classify_zip() { assert!(matches!( classify_mime("application/zip", b"PK\x03\x04data"), DetectResult::Zip )); } #[test] fn classify_gzip_archive() { assert!(matches!( classify_mime("application/gzip", b"\x1f\x8bdata"), DetectResult::Archive )); } #[test] fn classify_png_with_content() { let mut buf = vec![0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n']; buf.extend_from_slice(b"IHDR_data"); assert!(matches!( classify_mime("image/png", &buf), DetectResult::Png )); } #[test] fn classify_png_stub_reclassified_as_unknown_binary() { // exactly the 8-byte PNG signature, no content let buf = [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n']; assert!(matches!( classify_mime("image/png", &buf), DetectResult::UnknownBinary )); } #[test] fn classify_elf_executable() { assert!(matches!( classify_mime("application/x-executable", b"\x7fELF"), DetectResult::Elf )); } #[test] fn classify_shellscript_produces_script_variant() { let buf = b"#!/bin/bash\necho hi\n"; assert!(matches!( classify_mime("text/x-shellscript", buf), DetectResult::Script(_, _) )); } #[test] fn classify_unknown_mime_preserved() { assert!(matches!( classify_mime("application/octet-stream", b"\x00\x01\x02"), DetectResult::Mime(ref m) if m == "application/octet-stream" )); } // ── detect() — integration tests with real files ────────────────────── #[test] fn detect_empty_file() { let dir = tempfile::tempdir().unwrap(); let path = dir.path().join("empty"); fs::write(&path, b"").unwrap(); assert!(matches!(detect(&path), Ok(DetectResult::Zerofile))); } #[test] fn detect_text_lf() { let dir = tempfile::tempdir().unwrap(); let path = dir.path().join("text.txt"); fs::write(&path, b"hello\nworld\n").unwrap(); assert!(matches!( detect(&path), Ok(DetectResult::Text(LineEnding::Lf)) )); } #[test] fn detect_shebang_script() { let dir = tempfile::tempdir().unwrap(); let path = dir.path().join("script.py"); fs::write(&path, b"#!/usr/bin/env python3\nprint('hi')\n").unwrap(); match detect(&path).unwrap() { DetectResult::Script(LineEnding::Lf, interp) => { assert_eq!(interp, "/usr/bin/env"); } other => panic!("expected Script, got {other}"), } } #[test] fn detect_utf8_bom() { let dir = tempfile::tempdir().unwrap(); let path = dir.path().join("bom.txt"); fs::write(&path, b"\xEF\xBB\xBFhello").unwrap(); assert!(matches!( detect(&path), Ok(DetectResult::Bom(BomKind::Utf8)) )); } #[test] fn detect_directory() { let dir = tempfile::tempdir().unwrap(); assert!(matches!(detect(dir.path()), Ok(DetectResult::Directory))); } #[test] fn detect_broken_symlink() { let dir = tempfile::tempdir().unwrap(); let link = dir.path().join("broken"); symlink("/nonexistent_xyz", &link).unwrap(); assert!(matches!(detect(&link), Ok(DetectResult::BrokenSymlink))); } fn make_v7_tar() -> Vec { let mut h = vec![0u8; 512]; h[..8].copy_from_slice(b"test.txt"); h[100..108].copy_from_slice(b"0000644\0"); h[108..116].copy_from_slice(b"0000000\0"); h[116..124].copy_from_slice(b"0000000\0"); h[124..136].copy_from_slice(b"00000000000\0"); h[136..148].copy_from_slice(b"00000000000\0"); h[156] = b'0'; let sum: u32 = h .iter() .enumerate() .map(|(i, &b)| { if (148..156).contains(&i) { 0x20 } else { b as u32 } }) .sum(); let s = format!("{sum:07o}\0"); h[148..156].copy_from_slice(s.as_bytes()); h } #[test] fn detect_v7_tar_is_archive() { let dir = tempfile::tempdir().unwrap(); let path = dir.path().join("archive.tar"); fs::write(&path, &make_v7_tar()).unwrap(); assert!(matches!(detect(&path), Ok(DetectResult::Archive))); } #[test] fn detect_dos_eps_is_postscript() { let dir = tempfile::tempdir().unwrap(); let path = dir.path().join("figure.eps"); // DOS EPS magic followed by dummy content let mut buf = vec![0xC5, 0xD0, 0xD3, 0xC6]; buf.extend_from_slice(b"\x00\x00\x00\x00padding data"); fs::write(&path, &buf).unwrap(); assert!(matches!(detect(&path), Ok(DetectResult::PostScript))); } #[test] fn detect_lzma_old_format_is_archive() { let dir = tempfile::tempdir().unwrap(); let path = dir.path().join("data.lzma"); // Properties byte 0x5D (93 ≤ 224) + dict size 0x00800000 (8 MiB, power of two in LE) let buf = [ 0x5D, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]; fs::write(&path, &buf).unwrap(); assert!(matches!(detect(&path), Ok(DetectResult::Archive))); } #[test] fn detect_valid_symlink_is_skipped() { let dir = tempfile::tempdir().unwrap(); let target = dir.path().join("target.txt"); fs::write(&target, b"content").unwrap(); let link = dir.path().join("link"); symlink(&target, &link).unwrap(); assert!(matches!(detect(&link), Ok(DetectResult::Skipped))); } }