在 cxx_string_to_string 中针对 UTF-8 混合编码做特殊处理，舍弃无法作为 UTF-8 解析的部分

2026-01-29 21:58:05 +08:00
parent b5464925cf
commit 94e81cbfd6
1 changed files with 60 additions and 1 deletions
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -6,6 +6,11 @@ pub fn cxx_string_to_string(s: &cxx::CxxString) -> String {
        Err(_) => {}
    };

+    //  如果找到了utf-8码点，则它可能是UTF-8混合编码，那么，舍弃无法作为UTF-8解析的部分。
+    if contains_non_ascii_utf8_codepoint(s.as_bytes()) {
+        return s.to_string_lossy().into_owned();
+    }
+
    // 不是UTF-8，尝试转换
    let candidates = [
        "gb18030",      // 覆盖 GBK/GB2312 的常见场景
@@ -27,3 +32,57 @@ pub fn cxx_string_to_string(s: &cxx::CxxString) -> String {
    // 编码全部没有命中的话，则丢弃无法解析的部分
    s.to_string_lossy().into_owned()
 }
+
/// Reports whether `bytes` contains at least one strictly valid, non-ASCII
/// UTF-8 code point.
///
/// - ASCII bytes (`0x00..=0x7F`) never count as a match.
/// - Any well-formed 2-, 3- or 4-byte sequence makes the function return `true`.
/// - Overlong encodings, UTF-16 surrogates, values above U+10FFFF, stray
///   continuation bytes and truncated sequences are skipped. Scanning resumes
///   at the very next byte, so a malformed or incomplete lead byte never hides
///   a valid code point that follows it.
///
/// Returns `false` for empty input.
pub fn contains_non_ascii_utf8_codepoint(bytes: &[u8]) -> bool {
    let mut rest = bytes;

    loop {
        match std::str::from_utf8(rest) {
            // Everything that is left is valid UTF-8, so the answer depends only
            // on whether it holds anything beyond plain ASCII.
            Ok(text) => return !text.is_ascii(),
            Err(err) => {
                // `valid` is the longest well-formed prefix; `tail` starts at the
                // byte where strict validation failed.
                let (valid, tail) = rest.split_at(err.valid_up_to());
                if !valid.is_ascii() {
                    return true;
                }

                // Skip exactly one byte and retry. `tail` is never empty here
                // because a validation error implies at least one offending byte.
                // Advancing by a single byte (rather than bailing out) is what
                // lets a valid code point be found after a lead byte whose
                // announced length does not fit in the remaining input.
                rest = &tail[1..];
            }
        }
    }
}