// Developed by Manfred Lotz in cooperation with Claude (Anthropic).

use anyhow::{Context, Result};
use std::fmt;
use std::fs::File;
use std::io::Read;
use std::path::Path;

/// Line-terminator convention found in a sample of bytes.
///
/// `PartialEq`/`Eq` are derived so values can be compared directly with `==` and
/// `assert_eq!` instead of only through pattern matching.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LineEnding {
    /// Terminators are lone `\n` bytes.
    Lf,
    /// Terminators are `\r\n` pairs.
    CrLf,
    /// Terminators are lone `\r` bytes.
    Cr,
    /// More than one kind of terminator occurs; the fields are the occurrence
    /// counts in the order `(cr, lf, crlf)`.
    Mixed(usize, usize, usize),
}

impl fmt::Display for LineEnding {
    /// Writes the short label: `LF`, `CRLF`, `CR`, or the per-kind counts for `Mixed`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Lf => f.write_str("LF"),
            Self::CrLf => f.write_str("CRLF"),
            Self::Cr => f.write_str("CR"),
            Self::Mixed(cr, lf, crlf) => {
                write!(f, "mixed endings ({cr} CR, {lf} LF, {crlf} CRLF)")
            }
        }
    }
}

/// Unicode byte-order mark recognised at the start of a file.
///
/// `PartialEq`/`Eq` are derived so values can be compared directly with `==` and
/// `assert_eq!` instead of only through pattern matching.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BomKind {
    /// UTF-8 byte-order mark.
    Utf8,
    /// UTF-16 little-endian byte-order mark.
    Utf16Le,
    /// UTF-16 big-endian byte-order mark.
    Utf16Be,
    /// UTF-32 little-endian byte-order mark.
    Utf32Le,
    /// UTF-32 big-endian byte-order mark.
    Utf32Be,
}

impl fmt::Display for BomKind {
    /// Writes the encoding name followed by the word `BOM`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Self::Utf8 => "UTF-8 BOM",
            Self::Utf16Le => "UTF-16 LE BOM",
            Self::Utf16Be => "UTF-16 BE BOM",
            Self::Utf32Le => "UTF-32 LE BOM",
            Self::Utf32Be => "UTF-32 BE BOM",
        };
        f.write_str(label)
    }
}

/// Outcome of classifying one filesystem entry.
#[derive(Debug, Clone)]
pub enum DetectResult {
    /// PDF document.
    Pdf,
    /// Executable or shared object; displayed as "ELF binary".
    Elf,
    /// PostScript document.
    PostScript,
    /// ZIP container.
    Zip,
    /// Compressed stream or tar archive.
    Archive,
    /// PNG image.
    Png,
    /// `infer` recognised something we don't have a dedicated variant for;
    /// the payload is the MIME type it reported.
    Mime(String),
    /// Plain text together with its line-ending convention.
    Text(LineEnding),
    /// Script: `(line ending, interpreter)`.
    Script(LineEnding, String),
    /// Content that begins with a Unicode byte-order mark.
    Bom(BomKind),
    /// Content that is neither recognised nor text.
    UnknownBinary,
    /// File with no content.
    Zerofile,
    /// Directory.
    Directory,
    /// Symbolic link whose target does not exist.
    BrokenSymlink,
    /// Entry that was deliberately not inspected.
    Skipped,
}

impl fmt::Display for DetectResult {
    /// Writes the human-readable label for the classification.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Variants that carry data are formatted right away; every other variant
        // maps to a fixed label that is written once at the end.
        let label = match self {
            Self::Mime(mime) => return write!(f, "{mime}"),
            Self::Text(ending) => return write!(f, "text ({ending})"),
            Self::Script(ending, interpreter) => {
                return write!(f, "script ({ending}) {interpreter}");
            }
            Self::Bom(bom) => return write!(f, "{bom}"),
            Self::Pdf => "PDF",
            Self::Elf => "ELF binary",
            Self::PostScript => "PostScript",
            Self::Zip => "ZIP",
            Self::Archive => "archive",
            Self::Png => "PNG image",
            Self::UnknownBinary => "unknown binary",
            Self::Zerofile => "zero-length file",
            Self::Directory => "directory",
            Self::BrokenSymlink => "broken symlink",
            Self::Skipped => "skipped",
        };
        f.write_str(label)
    }
}

/// Tallies the line terminators in `bytes` and reports which convention is in use.
///
/// A `\r` immediately followed by `\n` counts as one CRLF, any other `\r` as a lone CR,
/// and a `\n` not preceded by `\r` as a lone LF.  If exactly one kind occurs, that kind
/// is returned; if none occurs the result defaults to `LineEnding::Lf`; otherwise the
/// counts are returned as `LineEnding::Mixed(cr, lf, crlf)`.
fn detect_line_endings(bytes: &[u8]) -> LineEnding {
    let (mut lone_cr, mut lone_lf, mut pairs) = (0usize, 0usize, 0usize);
    let mut after_cr = false;

    for (pos, &byte) in bytes.iter().enumerate() {
        match byte {
            b'\r' if bytes.get(pos + 1) == Some(&b'\n') => pairs += 1,
            b'\r' => lone_cr += 1,
            // A `\n` directly after `\r` was already counted as part of the pair.
            b'\n' if !after_cr => lone_lf += 1,
            _ => {}
        }
        after_cr = byte == b'\r';
    }

    let kinds_present = [lone_cr, lone_lf, pairs]
        .iter()
        .filter(|&&count| count > 0)
        .count();

    if kinds_present > 1 {
        LineEnding::Mixed(lone_cr, lone_lf, pairs)
    } else if pairs > 0 {
        LineEnding::CrLf
    } else if lone_cr > 0 {
        LineEnding::Cr
    } else {
        // Either only LF terminators, or no terminators at all (default).
        LineEnding::Lf
    }
}

/// Reports whether `buf` starts with a 512-byte tar header whose stored checksum is valid.
///
/// The checksum is the sum of all 512 header bytes with the eight bytes of the checksum
/// field (offsets 148..156) counted as ASCII spaces.  The stored value is read from that
/// field as an octal number after trimming surrounding NUL and space bytes.  Buffers
/// shorter than 512 bytes, and fields that are not valid UTF-8 or not octal, yield `false`.
fn is_v7_tar(buf: &[u8]) -> bool {
    const HEADER_LEN: usize = 512;
    const CHECKSUM_START: usize = 148;
    const CHECKSUM_LEN: usize = 8;

    let Some(header) = buf.get(..HEADER_LEN) else {
        return false;
    };

    // Split the header into the bytes before the checksum field, the field itself,
    // and the bytes after it.
    let (before, rest) = header.split_at(CHECKSUM_START);
    let (field, after) = rest.split_at(CHECKSUM_LEN);

    // Each byte of the checksum field contributes one space (0x20) to the sum.
    let mut expected = 0x20u32 * CHECKSUM_LEN as u32;
    for &byte in before.iter().chain(after) {
        expected += u32::from(byte);
    }

    let Ok(text) = std::str::from_utf8(field) else {
        return false;
    };
    let digits = text.trim_matches(&['\0', ' '][..]);
    matches!(u32::from_str_radix(digits, 8), Ok(stored) if stored == expected)
}

/// Heuristically decides whether `bytes` look like text.
///
/// Any NUL byte means "not text".  Otherwise valid UTF-8 (including the empty slice) is
/// text.  For anything else, the sample counts as text when more than 90 % of its bytes
/// (integer percentage) are either `>= 0x20` or one of `\n`, `\r`, `\t`.
fn is_text_content(bytes: &[u8]) -> bool {
    let has_nul = bytes.iter().any(|&b| b == 0);
    if has_nul {
        return false;
    }

    match std::str::from_utf8(bytes) {
        Ok(_) => true,
        Err(_) => {
            // `bytes` cannot be empty here: the empty slice is valid UTF-8.
            let mut printable = 0usize;
            for &b in bytes {
                if matches!(b, b'\t' | b'\n' | b'\r' | 0x20..=0xFF) {
                    printable += 1;
                }
            }
            printable * 100 / bytes.len() > 90
        }
    }
}

/// Returns the interpreter named on the shebang line at the start of `buf`.
///
/// The first two bytes (the `#!` marker) are skipped without being checked.  The rest of
/// the first line, up to the first `\n` or `\r`, is decoded as UTF-8 and its first
/// whitespace-separated word is returned, so interpreter arguments are dropped.
///
/// Returns an empty string when `buf` is shorter than the marker, when the line is not
/// valid UTF-8, or when nothing but whitespace follows the marker.
fn extract_interpreter(buf: &[u8]) -> String {
    // Checked slicing: a buffer shorter than two bytes yields an empty interpreter
    // instead of a slice-index panic.
    let after_marker = buf.get(2..).unwrap_or_default();
    let line = after_marker
        .split(|&b| b == b'\n' || b == b'\r')
        .next()
        .unwrap_or_default();
    std::str::from_utf8(line)
        .ok()
        .and_then(|text| text.split_whitespace().next())
        .unwrap_or_default()
        .to_string()
}

/// Identifies a Unicode byte-order mark at the very start of `buf`, if there is one.
fn detect_bom(buf: &[u8]) -> Option<BomKind> {
    // Arm order matters: the UTF-32 LE mark begins with the UTF-16 LE mark, so the
    // four-byte patterns have to be tried before the two-byte ones.
    match buf {
        [0xFF, 0xFE, 0x00, 0x00, ..] => Some(BomKind::Utf32Le),
        [0x00, 0x00, 0xFE, 0xFF, ..] => Some(BomKind::Utf32Be),
        [0xEF, 0xBB, 0xBF, ..] => Some(BomKind::Utf8),
        [0xFF, 0xFE, ..] => Some(BomKind::Utf16Le),
        [0xFE, 0xFF, ..] => Some(BomKind::Utf16Be),
        _ => None,
    }
}

/// Maps a MIME type reported for `buf` onto a [`DetectResult`], re-checking the cases
/// where the signature alone is not convincing.
///
/// MIME types without a dedicated variant are passed through as `DetectResult::Mime`.
fn classify_mime(mime: &str, buf: &[u8]) -> DetectResult {
    // The PNG signature is 8 bytes; a file no longer than that has no content beyond it.
    const PNG_SIGNATURE_LEN: usize = 8;

    match mime {
        "application/pdf" => DetectResult::Pdf,
        // infer triggers on %! alone; real PostScript starts with %!PS.  Anything else
        // (a bare two-byte %! stub, or e.g. %!TEX magic comments in TeX files) is
        // classified by its content instead.
        "application/postscript" => {
            if buf.starts_with(b"%!PS") {
                DetectResult::PostScript
            } else if is_text_content(buf) {
                DetectResult::Text(detect_line_endings(buf))
            } else {
                DetectResult::UnknownBinary
            }
        }
        "application/zip" | "application/x-zip-compressed" => DetectResult::Zip,
        "application/gzip"
        | "application/x-tar"
        | "application/x-bzip2"
        | "application/x-xz"
        | "application/zstd" => DetectResult::Archive,
        "image/png" => {
            if buf.len() > PNG_SIGNATURE_LEN {
                DetectResult::Png
            } else {
                // PNG stub: nothing but the signature.
                DetectResult::UnknownBinary
            }
        }
        "application/x-executable"
        | "application/x-pie-executable"
        | "application/x-sharedlib"
        | "application/x-mach-binary" => DetectResult::Elf,
        "text/x-shellscript" => {
            let ending = detect_line_endings(buf);
            let interpreter = extract_interpreter(buf);
            DetectResult::Script(ending, interpreter)
        }
        other => DetectResult::Mime(other.to_owned()),
    }
}

/// Classifies the filesystem entry at `path`.
///
/// Directories, symlinks and other non-regular entries are reported from metadata alone
/// (symlinks are never followed for content).  For a regular file, up to the first 8192
/// bytes are sampled and checked in this order: empty file, byte-order mark, DOS EPS
/// magic, `infer` signature, V7 tar header checksum, old-format LZMA heuristic, text
/// heuristic, and finally a `#!` prefix to tell scripts from plain text.
///
/// # Errors
///
/// Returns an error when the entry's metadata cannot be read, or when the file cannot be
/// opened or read; each error carries the path as context.
pub fn detect(path: &Path) -> Result<DetectResult> {
    // Number of leading bytes sampled from a regular file.
    const SNIFF_LEN: u64 = 8192;

    let ft = path
        .symlink_metadata()
        .with_context(|| format!("stat {}", path.display()))?
        .file_type();

    if ft.is_dir() {
        return Ok(DetectResult::Directory);
    }
    if ft.is_symlink() {
        // `exists` follows the link, so it tells whether the target is reachable.
        return Ok(if path.exists() {
            DetectResult::Skipped
        } else {
            DetectResult::BrokenSymlink
        });
    }
    if !ft.is_file() {
        return Ok(DetectResult::Skipped);
    }

    // A single `read` call may return fewer bytes than are available and can fail with
    // `ErrorKind::Interrupted`.  `read_to_end` on a `take`-limited reader keeps reading
    // until the limit or EOF and retries interrupted reads, so the sample is always as
    // long as the file allows.
    let mut head = Vec::new();
    File::open(path)
        .with_context(|| format!("opening {}", path.display()))?
        .take(SNIFF_LEN)
        .read_to_end(&mut head)
        .with_context(|| format!("reading {}", path.display()))?;
    let buf = head.as_slice();

    if buf.is_empty() {
        return Ok(DetectResult::Zerofile);
    }

    if let Some(bom) = detect_bom(buf) {
        return Ok(DetectResult::Bom(bom));
    }

    // DOS EPS binary preview (EPSI): binary wrapper around PostScript + TIFF
    if buf.starts_with(&[0xC5, 0xD0, 0xD3, 0xC6]) {
        return Ok(DetectResult::PostScript);
    }

    if let Some(t) = infer::get(buf) {
        return Ok(classify_mime(t.mime_type(), buf));
    }

    // V7/pre-POSIX tar has no ustar magic; detect via header checksum validation.
    // Placed before LZMA so a tar whose first filename byte is ≤ 224 doesn't fall through.
    if is_v7_tar(buf) {
        return Ok(DetectResult::Archive);
    }

    // LZMA old format has no distinctive magic; detect via heuristic: properties byte ≤ 224
    // and a 32-bit LE dictionary size that is a non-zero power of two.  Placed after infer so
    // formats sharing the leading-byte pattern (e.g. TrueType 00 01 00 00) are already handled.
    if buf.len() >= 5 && buf[0] <= 224 {
        let dict = u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]);
        if dict != 0 && dict.is_power_of_two() {
            return Ok(DetectResult::Archive);
        }
    }

    if !is_text_content(buf) {
        return Ok(DetectResult::UnknownBinary);
    }

    let le = detect_line_endings(buf);
    if buf.starts_with(b"#!") {
        Ok(DetectResult::Script(le, extract_interpreter(buf)))
    } else {
        Ok(DetectResult::Text(le))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    // `std::os::unix` only exists on Unix targets; the import and the two symlink tests
    // below are gated so the rest of the suite still builds elsewhere.
    #[cfg(unix)]
    use std::os::unix::fs::symlink;

    // ── detect_line_endings ──────────────────────────────────────────────────

    #[test]
    fn line_endings_lf() {
        assert!(matches!(detect_line_endings(b"foo\nbar\n"), LineEnding::Lf));
    }

    #[test]
    fn line_endings_crlf() {
        assert!(matches!(
            detect_line_endings(b"foo\r\nbar\r\n"),
            LineEnding::CrLf
        ));
    }

    #[test]
    fn line_endings_cr() {
        assert!(matches!(detect_line_endings(b"foo\rbar\r"), LineEnding::Cr));
    }

    #[test]
    fn line_endings_mixed() {
        // 1 CR, 1 LF, 1 CRLF
        assert!(matches!(
            detect_line_endings(b"foo\nbar\r\nbaz\r"),
            LineEnding::Mixed(1, 1, 1)
        ));
    }

    #[test]
    fn line_endings_none_defaults_lf() {
        assert!(matches!(
            detect_line_endings(b"no newlines"),
            LineEnding::Lf
        ));
    }

    // ── detect_bom ──────────────────────────────────────────────────────────

    #[test]
    fn bom_utf8() {
        assert!(matches!(
            detect_bom(&[0xEF, 0xBB, 0xBF, b'a']),
            Some(BomKind::Utf8)
        ));
    }

    #[test]
    fn bom_utf16_le() {
        assert!(matches!(
            detect_bom(&[0xFF, 0xFE, b'a', 0x00]),
            Some(BomKind::Utf16Le)
        ));
    }

    #[test]
    fn bom_utf16_be() {
        assert!(matches!(
            detect_bom(&[0xFE, 0xFF, 0x00, b'a']),
            Some(BomKind::Utf16Be)
        ));
    }

    #[test]
    fn bom_utf32_le_not_confused_with_utf16_le() {
        // \xFF\xFE\x00\x00 must be UTF-32 LE, not UTF-16 LE
        assert!(matches!(
            detect_bom(&[0xFF, 0xFE, 0x00, 0x00]),
            Some(BomKind::Utf32Le)
        ));
    }

    #[test]
    fn bom_utf32_be() {
        assert!(matches!(
            detect_bom(&[0x00, 0x00, 0xFE, 0xFF]),
            Some(BomKind::Utf32Be)
        ));
    }

    #[test]
    fn bom_none() {
        assert!(detect_bom(b"hello world").is_none());
    }

    // ── is_text_content ─────────────────────────────────────────────────────

    #[test]
    fn text_content_valid_utf8() {
        assert!(is_text_content(b"hello, world!\n"));
    }

    #[test]
    fn text_content_null_byte_is_binary() {
        assert!(!is_text_content(b"hello\x00world"));
    }

    #[test]
    fn text_content_high_control_bytes_is_binary() {
        assert!(!is_text_content(&[
            0x80, 0x81, 0x82, 0x83, 0x01, 0x02, 0x03, 0x04
        ]));
    }

    // ── extract_interpreter ──────────────────────────────────────────────────

    #[test]
    fn interpreter_env() {
        assert_eq!(
            extract_interpreter(b"#!/usr/bin/env python3\ncode"),
            "/usr/bin/env"
        );
    }

    #[test]
    fn interpreter_direct() {
        assert_eq!(extract_interpreter(b"#!/bin/bash\n"), "/bin/bash");
    }

    #[test]
    fn interpreter_with_flag_strips_args() {
        assert_eq!(
            extract_interpreter(b"#!/usr/bin/perl -w\n"),
            "/usr/bin/perl"
        );
    }

    // ── classify_mime ────────────────────────────────────────────────────────

    #[test]
    fn classify_pdf() {
        assert!(matches!(
            classify_mime("application/pdf", b"%PDF-1.4"),
            DetectResult::Pdf
        ));
    }

    #[test]
    fn classify_postscript_with_content() {
        assert!(matches!(
            classify_mime("application/postscript", b"%!PS-Adobe-3.0\nextra"),
            DetectResult::PostScript
        ));
    }

    #[test]
    fn classify_tex_magic_comment_is_text() {
        // %!TEX shares the %! prefix with PostScript but is not PostScript
        let buf = b"%!TEX TS-program = Arara\n\\documentclass{article}";
        assert!(matches!(
            classify_mime("application/postscript", buf),
            DetectResult::Text(_)
        ));
    }

    #[test]
    fn classify_postscript_stub_reclassified_as_text() {
        // exactly 2 bytes — only the signature, no content
        assert!(matches!(
            classify_mime("application/postscript", b"%!"),
            DetectResult::Text(_)
        ));
    }

    #[test]
    fn classify_zip() {
        assert!(matches!(
            classify_mime("application/zip", b"PK\x03\x04data"),
            DetectResult::Zip
        ));
    }

    #[test]
    fn classify_gzip_archive() {
        assert!(matches!(
            classify_mime("application/gzip", b"\x1f\x8bdata"),
            DetectResult::Archive
        ));
    }

    #[test]
    fn classify_png_with_content() {
        let mut buf = vec![0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'];
        buf.extend_from_slice(b"IHDR_data");
        assert!(matches!(
            classify_mime("image/png", &buf),
            DetectResult::Png
        ));
    }

    #[test]
    fn classify_png_stub_reclassified_as_unknown_binary() {
        // exactly the 8-byte PNG signature, no content
        let buf = [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'];
        assert!(matches!(
            classify_mime("image/png", &buf),
            DetectResult::UnknownBinary
        ));
    }

    #[test]
    fn classify_elf_executable() {
        assert!(matches!(
            classify_mime("application/x-executable", b"\x7fELF"),
            DetectResult::Elf
        ));
    }

    #[test]
    fn classify_shellscript_produces_script_variant() {
        let buf = b"#!/bin/bash\necho hi\n";
        assert!(matches!(
            classify_mime("text/x-shellscript", buf),
            DetectResult::Script(_, _)
        ));
    }

    #[test]
    fn classify_unknown_mime_preserved() {
        assert!(matches!(
            classify_mime("application/octet-stream", b"\x00\x01\x02"),
            DetectResult::Mime(ref m) if m == "application/octet-stream"
        ));
    }

    // ── detect() — integration tests with real files ──────────────────────

    #[test]
    fn detect_empty_file() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("empty");
        fs::write(&path, b"").unwrap();
        assert!(matches!(detect(&path), Ok(DetectResult::Zerofile)));
    }

    #[test]
    fn detect_text_lf() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("text.txt");
        fs::write(&path, b"hello\nworld\n").unwrap();
        assert!(matches!(
            detect(&path),
            Ok(DetectResult::Text(LineEnding::Lf))
        ));
    }

    #[test]
    fn detect_shebang_script() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("script.py");
        fs::write(&path, b"#!/usr/bin/env python3\nprint('hi')\n").unwrap();
        match detect(&path).unwrap() {
            DetectResult::Script(LineEnding::Lf, interp) => {
                assert_eq!(interp, "/usr/bin/env");
            }
            other => panic!("expected Script, got {other}"),
        }
    }

    #[test]
    fn detect_utf8_bom() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("bom.txt");
        fs::write(&path, b"\xEF\xBB\xBFhello").unwrap();
        assert!(matches!(
            detect(&path),
            Ok(DetectResult::Bom(BomKind::Utf8))
        ));
    }

    #[test]
    fn detect_directory() {
        let dir = tempfile::tempdir().unwrap();
        assert!(matches!(detect(dir.path()), Ok(DetectResult::Directory)));
    }

    #[cfg(unix)]
    #[test]
    fn detect_broken_symlink() {
        let dir = tempfile::tempdir().unwrap();
        let link = dir.path().join("broken");
        symlink("/nonexistent_xyz", &link).unwrap();
        assert!(matches!(detect(&link), Ok(DetectResult::BrokenSymlink)));
    }

    // Builds a minimal 512-byte tar header without ustar magic whose checksum field
    // matches the header contents.
    fn make_v7_tar() -> Vec<u8> {
        let mut h = vec![0u8; 512];
        h[..8].copy_from_slice(b"test.txt");
        h[100..108].copy_from_slice(b"0000644\0");
        h[108..116].copy_from_slice(b"0000000\0");
        h[116..124].copy_from_slice(b"0000000\0");
        h[124..136].copy_from_slice(b"00000000000\0");
        h[136..148].copy_from_slice(b"00000000000\0");
        h[156] = b'0';
        let sum: u32 = h
            .iter()
            .enumerate()
            .map(|(i, &b)| {
                if (148..156).contains(&i) {
                    0x20
                } else {
                    b as u32
                }
            })
            .sum();
        let s = format!("{sum:07o}\0");
        h[148..156].copy_from_slice(s.as_bytes());
        h
    }

    #[test]
    fn detect_v7_tar_is_archive() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("archive.tar");
        fs::write(&path, &make_v7_tar()).unwrap();
        assert!(matches!(detect(&path), Ok(DetectResult::Archive)));
    }

    #[test]
    fn detect_dos_eps_is_postscript() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("figure.eps");
        // DOS EPS magic followed by dummy content
        let mut buf = vec![0xC5, 0xD0, 0xD3, 0xC6];
        buf.extend_from_slice(b"\x00\x00\x00\x00padding data");
        fs::write(&path, &buf).unwrap();
        assert!(matches!(detect(&path), Ok(DetectResult::PostScript)));
    }

    #[test]
    fn detect_lzma_old_format_is_archive() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("data.lzma");
        // Properties byte 0x5D (93 ≤ 224) + dict size 0x00800000 (8 MiB, power of two in LE)
        let buf = [
            0x5D, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        ];
        fs::write(&path, &buf).unwrap();
        assert!(matches!(detect(&path), Ok(DetectResult::Archive)));
    }

    #[cfg(unix)]
    #[test]
    fn detect_valid_symlink_is_skipped() {
        let dir = tempfile::tempdir().unwrap();
        let target = dir.path().join("target.txt");
        fs::write(&target, b"content").unwrap();
        let link = dir.path().join("link");
        symlink(&target, &link).unwrap();
        assert!(matches!(detect(&link), Ok(DetectResult::Skipped)));
    }
}