/// Reports whether `bytes` contains at least one strictly valid, non-ASCII
/// UTF-8 code point (a well-formed 2-, 3- or 4-byte sequence).
///
/// - ASCII bytes (`0x00..=0x7F`) never count.
/// - Overlong encodings, UTF-16 surrogates (`U+D800..=U+DFFF`) and values
///   above `U+10FFFF` are rejected, because every candidate is validated
///   with [`std::str::from_utf8`].
/// - The input does not have to be valid UTF-8 as a whole; a single
///   well-formed multi-byte sequence anywhere in it is enough.
///
/// Returns `false` for empty input.
pub fn contains_non_ascii_utf8_codepoint(bytes: &[u8]) -> bool {
    bytes.iter().enumerate().any(|(start, &lead)| {
        // Only these lead bytes can open a well-formed multi-byte sequence.
        // 0xC0/0xC1 (always overlong) and 0xF5..=0xFF are excluded, as are
        // ASCII bytes and continuation bytes (0x80..=0xBF).
        let len = match lead {
            0xC2..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF4 => 4,
            _ => return false,
        };

        // A candidate that would run past the end of the input is simply not
        // a match at this position. Scanning must continue, because a shorter
        // valid sequence may still start at a later byte
        // (e.g. `[0xF0, 0xC3, 0xA9]` contains a valid "é" after the stray 0xF0).
        match bytes.get(start..start + len) {
            // Strict validation: checks continuation bytes and rejects
            // overlong forms, surrogates and out-of-range values.
            Some(candidate) => std::str::from_utf8(candidate).is_ok(),
            None => false,
        }
    })
}