From 94e81cbfd68bb433615df792270e16f500a7949d Mon Sep 17 00:00:00 2001
From: Starpoles <ling@noreply.lingapi.com>
Date: Thu, 29 Jan 2026 21:58:05 +0800
Subject: [PATCH] =?UTF-8?q?=E5=9C=A8cxx=5Fstring=5Fto=5Fstring=E4=B8=AD?=
 =?UTF-8?q?=E9=92=88=E5=AF=B9UTF-8=E6=B7=B7=E5=90=88=E7=BC=96=E7=A0=81?=
 =?UTF-8?q?=E7=89=B9=E6=AE=8A=E5=A4=84=E7=90=86=EF=BC=8C=E8=88=8D=E5=BC=83?=
 =?UTF-8?q?=E6=97=A0=E6=B3=95=E4=BD=9C=E4=B8=BAutf-8=E8=A7=A3=E6=9E=90?=
 =?UTF-8?q?=E7=9A=84=E9=83=A8=E5=88=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/utils.rs | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/src/utils.rs b/src/utils.rs
index 8e1e3d6..5774156 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -6,6 +6,11 @@ pub fn cxx_string_to_string(s: &cxx::CxxString) -> String {
         Err(_) => {}
     };
 
+    //  如果找到了utf-8码点，则它可能是UTF-8混合编码，那么，舍弃无法作为UTF-8解析的部分。
+    if contains_non_ascii_utf8_codepoint(s.as_bytes()) {
+        return s.to_string_lossy().into_owned();
+    }
+
     // 不是UTF-8，尝试转换
     let candidates = [
         "gb18030",      // 覆盖 GBK/GB2312 的常见场景
@@ -26,4 +31,58 @@ pub fn cxx_string_to_string(s: &cxx::CxxString) -> String {
 
     // 编码全部没有命中的话，则丢弃无法解析的部分
     s.to_string_lossy().into_owned()
-}
\ No newline at end of file
+}
+
+/// Reports whether `bytes` contains at least one strictly valid, non-ASCII
+/// UTF-8 code point, i.e. a well-formed 2-, 3- or 4-byte sequence.
+///
+/// ASCII bytes (`0x00..=0x7F`) never count. Every offset is tried as a
+/// possible sequence start, so a valid sequence is found even when it is
+/// surrounded by bytes that are not UTF-8. Validation is strict: overlong
+/// encodings, UTF-16 surrogates and values above U+10FFFF are rejected.
+///
+/// A lead byte whose sequence would run past the end of the input is
+/// skipped instead of ending the scan, because a shorter valid sequence may
+/// still follow it: `[0xF1, 0xC3, 0xA9]` ends with the two bytes of `é`.
+///
+/// Returns `true` as soon as the first such code point is found, and `false`
+/// when the whole input has been scanned without finding one (this includes
+/// empty and pure-ASCII input).
+pub fn contains_non_ascii_utf8_codepoint(bytes: &[u8]) -> bool {
+    let mut i = 0usize;
+
+    while i < bytes.len() {
+        let b0 = bytes[i];
+
+        // Sequence length implied by the byte at `i`, if it can start one.
+        let len = match b0 {
+            0xC2..=0xDF => 2,
+            0xE0..=0xEF => 3,
+            0xF0..=0xF4 => 4,
+            // ASCII (not counted), a stray continuation byte (0x80..=0xBF),
+            // or a byte that is never a valid lead (0xC0, 0xC1, 0xF5..=0xFF):
+            // no multi-byte sequence can start here.
+            _ => {
+                i += 1;
+                continue;
+            }
+        };
+
+        // `get` yields `None` when the sequence would be cut off by the end
+        // of the input. That must not end the scan, since a shorter sequence
+        // may still start at a later offset.
+        let candidate = bytes.get(i..i + len);
+
+        // Strict validation: `from_utf8` rejects bad continuation bytes,
+        // overlong encodings, surrogates and values above U+10FFFF.
+        let is_valid = matches!(candidate.map(std::str::from_utf8), Some(Ok(_)));
+        if is_valid {
+            return true;
+        }
+
+        // Nothing valid starts at this offset; retry from the next byte.
+        i += 1;
+    }
+
+    false
+}