//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/
//! with the help of Claude.ai.
//!
//! MIT licensed.

#![cfg(any(target_arch = "x86_64", target_arch = "x86"))]

#[cfg(target_arch = "x86")]
use core::arch::x86::*;

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::fusion::x86::*;

/// Updates a CRC-32C (iSCSI) state over `len` bytes starting at `buf`, combining the SSE4.2
/// CRC instructions with PCLMULQDQ-based folding.
///
/// Rust port (made with the help of Claude.ai) of the C routine generated by
/// https://github.com/corsix/fast-crc32/ using:
///
/// ./generate -i sse -p crc32c -a v4s3x3
///
/// Modified as necessary for this Rust implementation. `crc0` is consumed as given and the
/// result is returned as computed: no bit inversion is applied on entry or on exit.
///
/// Inputs of at least 144 bytes (after alignment) are split into 136-byte units: 64 bytes of
/// each unit feed four 128-bit folding accumulators, and 72 bytes feed three independent
/// scalar CRC streams (24 bytes each). The partial results are merged into the 8 bytes that
/// follow the last scalar stream. Anything left over is handled 8 bytes, then 1 byte, at a time.
///
/// # Safety
///
/// * `buf` must be valid for reads of `len` bytes.
/// * The executing CPU must support the `sse4.2` and `pclmulqdq` target features.
#[inline]
#[target_feature(enable = "sse4.2,pclmulqdq")]
pub unsafe fn crc32_iscsi_sse_v4s3x3(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 {
    // Consume single bytes until the pointer sits on an 8-byte boundary or the input runs out.
    while len != 0 && (buf as usize) % 8 != 0 {
        crc0 = _mm_crc32_u8(crc0, *buf);
        buf = buf.add(1);
        len -= 1;
    }

    // If address bit 3 is still set, one more 8-byte word moves the pointer to a 16-byte
    // boundary.
    if len >= 8 && (buf as usize & 8) != 0 {
        crc0 = mm_crc32_u64(crc0, *(buf as *const u64));
        buf = buf.add(8);
        len -= 8;
    }

    if len >= 144 {
        // Number of 136-byte units, holding back 8 bytes for the merge step at the end.
        let blocks = (len - 8) / 136;
        // Byte length of each scalar stream (24 bytes per unit).
        let stride = blocks * 24;

        // CRC state of the second and third scalar streams; `crc0` carries the first one.
        let mut lane1: u32 = 0;
        let mut lane2: u32 = 0;

        // The vector stream starts at the current position and spans `blocks * 64` bytes.
        // Prime the four accumulators with its first 64 bytes.
        let mut vec_ptr = buf;
        let mut x0 = _mm_loadu_si128(vec_ptr as *const __m128i);
        let mut x1 = _mm_loadu_si128(vec_ptr.add(16) as *const __m128i);
        let mut x2 = _mm_loadu_si128(vec_ptr.add(32) as *const __m128i);
        let mut x3 = _mm_loadu_si128(vec_ptr.add(48) as *const __m128i);
        vec_ptr = vec_ptr.add(64);

        // Inject the running CRC into the low 32 bits of the first accumulator; the first
        // scalar stream then restarts from zero.
        x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(crc0 as i32));
        crc0 = 0;

        // The three scalar streams are laid out back to back right after the vector stream.
        buf = buf.add(blocks * 64);
        len -= 136;

        // Folding constant pair used for every loop step (iSCSI polynomial specific).
        // NOTE(review): `clmul_lo_sse` / `clmul_hi_sse` come from `crate::fusion::x86`;
        // presumably carry-less multiplies of the low / high 64-bit halves — confirm there.
        let k_step = _mm_setr_epi32(0x740eef02u32 as i32, 0, 0x9e4addf8u32 as i32, 0);

        // Each iteration consumes one 136-byte unit: 64 vector bytes + 3 * 24 scalar bytes.
        while len >= 144 {
            let lo0 = clmul_lo_sse(x0, k_step);
            let hi0 = clmul_hi_sse(x0, k_step);
            let lo1 = clmul_lo_sse(x1, k_step);
            let hi1 = clmul_hi_sse(x1, k_step);
            let lo2 = clmul_lo_sse(x2, k_step);
            let hi2 = clmul_hi_sse(x2, k_step);
            let lo3 = clmul_lo_sse(x3, k_step);
            let hi3 = clmul_hi_sse(x3, k_step);

            // Combine both products with the next 64 input bytes. SSE has no three-input
            // XOR, so each accumulator takes two XORs.
            let d0 = _mm_loadu_si128(vec_ptr as *const __m128i);
            x0 = _mm_xor_si128(hi0, _mm_xor_si128(lo0, d0));
            let d1 = _mm_loadu_si128(vec_ptr.add(16) as *const __m128i);
            x1 = _mm_xor_si128(hi1, _mm_xor_si128(lo1, d1));
            let d2 = _mm_loadu_si128(vec_ptr.add(32) as *const __m128i);
            x2 = _mm_xor_si128(hi2, _mm_xor_si128(lo2, d2));
            let d3 = _mm_loadu_si128(vec_ptr.add(48) as *const __m128i);
            x3 = _mm_xor_si128(hi3, _mm_xor_si128(lo3, d3));

            // Advance the three scalar streams by 24 bytes each, interleaved so the three
            // dependency chains stay independent of one another.
            let s0 = buf as *const u64;
            let s1 = buf.add(stride) as *const u64;
            let s2 = buf.add(stride * 2) as *const u64;
            crc0 = mm_crc32_u64(crc0, *s0);
            lane1 = mm_crc32_u64(lane1, *s1);
            lane2 = mm_crc32_u64(lane2, *s2);
            crc0 = mm_crc32_u64(crc0, *s0.add(1));
            lane1 = mm_crc32_u64(lane1, *s1.add(1));
            lane2 = mm_crc32_u64(lane2, *s2.add(1));
            crc0 = mm_crc32_u64(crc0, *s0.add(2));
            lane1 = mm_crc32_u64(lane1, *s1.add(2));
            lane2 = mm_crc32_u64(lane2, *s2.add(2));

            buf = buf.add(24);
            vec_ptr = vec_ptr.add(64);
            len -= 136;
        }

        // Collapse the four accumulators pairwise: (x0, x1) -> x0 and (x2, x3) -> x2 ...
        let k_pair = _mm_setr_epi32(0xf20c0dfeu32 as i32, 0, 0x493c7d27u32 as i32, 0);
        let pair_lo0 = clmul_lo_sse(x0, k_pair);
        let pair_hi0 = clmul_hi_sse(x0, k_pair);
        let pair_lo2 = clmul_lo_sse(x2, k_pair);
        let pair_hi2 = clmul_hi_sse(x2, k_pair);
        x0 = _mm_xor_si128(pair_hi0, _mm_xor_si128(pair_lo0, x1));
        x2 = _mm_xor_si128(pair_hi2, _mm_xor_si128(pair_lo2, x3));

        // ... and then (x0, x2) -> a single 128-bit value.
        let k_last = _mm_setr_epi32(0x3da6d0cbu32 as i32, 0, 0xba4fc28eu32 as i32, 0);
        let last_lo = clmul_lo_sse(x0, k_last);
        let last_hi = clmul_hi_sse(x0, k_last);
        let folded = _mm_xor_si128(last_hi, _mm_xor_si128(last_lo, x2));

        // Last 24 bytes of each scalar stream (the unit whose vector part was loaded last).
        let s0 = buf as *const u64;
        let s1 = buf.add(stride) as *const u64;
        let s2 = buf.add(stride * 2) as *const u64;
        crc0 = mm_crc32_u64(crc0, *s0);
        lane1 = mm_crc32_u64(lane1, *s1);
        lane2 = mm_crc32_u64(lane2, *s2);
        crc0 = mm_crc32_u64(crc0, *s0.add(1));
        lane1 = mm_crc32_u64(lane1, *s1.add(1));
        lane2 = mm_crc32_u64(lane2, *s2.add(1));
        crc0 = mm_crc32_u64(crc0, *s0.add(2));
        lane1 = mm_crc32_u64(lane1, *s1.add(2));
        lane2 = mm_crc32_u64(lane2, *s2.add(2));
        buf = buf.add(24);

        // Bring the first two scalar streams forward to the merge point.
        // NOTE(review): `crc_shift_iscsi_sse` is defined in `crate::fusion::x86`; presumably
        // it advances a CRC past the given number of bytes — confirm against the helper.
        let shifted0 = crc_shift_iscsi_sse(crc0, stride * 2 + 8);
        let shifted1 = crc_shift_iscsi_sse(lane1, stride + 8);
        let scalar_fix = mm_extract_epi64(_mm_xor_si128(shifted0, shifted1), 0);

        // Reduce the 128-bit folded value to 32 bits (which also multiplies by x^32), then
        // bring it forward across all three scalar streams plus the merge word.
        let folded_lo = mm_extract_epi64(folded, 0);
        let folded_hi = mm_extract_epi64(folded, 1);
        let folded_crc = mm_crc32_u64(mm_crc32_u64(0, folded_lo), folded_hi);
        let vector_fix = mm_extract_epi64(crc_shift_iscsi_sse(folded_crc, stride * 3 + 8), 0);

        let fix = scalar_fix ^ vector_fix;

        // Skip past the second and third streams; the 8 bytes found there absorb the
        // correction term while continuing the third stream's CRC.
        buf = buf.add(stride * 2);
        crc0 = mm_crc32_u64(lane2, *(buf as *const u64) ^ fix);
        buf = buf.add(8);
        len -= 8;
    }

    // Whatever is left: whole 8-byte words first ...
    while len >= 8 {
        crc0 = mm_crc32_u64(crc0, *(buf as *const u64));
        buf = buf.add(8);
        len -= 8;
    }

    // ... then the trailing bytes one at a time.
    while len != 0 {
        crc0 = _mm_crc32_u8(crc0, *buf);
        buf = buf.add(1);
        len -= 1;
    }

    crc0
}
