diff --git a/Cargo.lock b/Cargo.lock
index 9aa7253..0ba0e7e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7,6 +7,7 @@ name = "DnfUtils"
 version = "0.1.0"
 dependencies = [
  "bytes",
+ "chardetng",
  "chrono",
  "colored",
  "cxx",
@@ -162,6 +163,17 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
 
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "chrono"
 version = "0.4.43"
diff --git a/Cargo.toml b/Cargo.toml
index 832a689..bb8268a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,6 +32,7 @@ prost = "0.14.3"
 tokio-tungstenite = "0.28.0"
 bytes = "1.11.0"
 spin = "0.10.0"
+chardetng = "0.1.17"
 
 [build-dependencies]
 cxx-build = "1.0.192"
diff --git a/src/lib.rs b/src/lib.rs
index 1620913..b20ec0c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,11 +7,11 @@ use encoding_rs::Encoding;
 use futures_util::SinkExt;
 use prost::Message as WebMessage;
 use spin::Mutex;
+use std::sync::OnceLock;
 use std::sync::mpsc;
 use tokio::net::TcpStream;
 use tokio::runtime::Runtime;
 use tokio_tungstenite::tungstenite::Message;
-use std::sync::OnceLock;
 
 static LOGGER_SENDER: OnceLock> = OnceLock::new();
 
@@ -28,6 +28,15 @@ mod ffi {
 
         fn http_get(url: &CxxString) -> Result;
 
+        /// Tries to convert a string of unknown encoding to UTF-8.
+        fn to_utf8(string: &CxxString) -> String;
+
+        /// Converts a UTF-16LE byte string to UTF-8.
+        fn unicode_to_utf_8(string: &CxxString) -> String;
+
+        /// Guesses the encoding of the given bytes.
+        fn guess_encoding(string: &CxxString) -> String;
+
         fn init_log(is_debug: bool, ws_uel: &CxxString);
 
         fn log_trace(msg: &CxxString);
@@ -42,6 +51,35 @@ mod ffi {
     }
 }
 
+/// Decodes `string` as UTF-16LE and returns the text as a UTF-8 `String`.
+///
+/// The input is always treated as UTF-16LE: a leading UTF-16LE BOM is removed
+/// and malformed sequences are replaced with U+FFFD.
+fn unicode_to_utf_8(string: &CxxString) -> String {
+    let bytes = string.as_bytes();
+
+    // UTF-16 code units are two bytes wide; drop a dangling trailing byte so
+    // only whole code units are decoded.
+    let bytes = &bytes[..bytes.len() & !1];
+
+    // `decode` sniffs BOMs and would switch to UTF-8 or UTF-16BE when the
+    // input happens to start with one of their BOMs; `decode_with_bom_removal`
+    // never changes the encoding.
+    let (text, _had_errors) = encoding_rs::UTF_16LE.decode_with_bom_removal(bytes);
+    text.into_owned()
+}
+
+/// Returns the name of the encoding guessed for the bytes of `string`.
+fn guess_encoding(string: &CxxString) -> String {
+    utils::guess_encoding_label(string)
+}
+
+/// Converts `string` to a UTF-8 `String` via `cxx_string_to_string`.
+fn to_utf8(string: &CxxString) -> String {
+    // The helper already returns an owned `String`, so no extra copy is made.
+    cxx_string_to_string(string)
+}
+
 fn log_error(msg: &cxx::CxxString) {
     let msg = cxx_string_to_string(msg);
     let lock = LOGGER_SENDER.get().unwrap();
diff --git a/src/utils.rs b/src/utils.rs
index 5774156..a841828 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -1,5 +1,15 @@
+use chardetng::EncodingDetector;
 use encoding_rs::Encoding;
 
+/// Returns the name of the encoding that chardetng guesses for the bytes of `s`.
+pub fn guess_encoding_label(s: &cxx::CxxString) -> String {
+    let mut detector = EncodingDetector::new();
+    // The whole input is fed at once, so this chunk is also the last one.
+    detector.feed(s.as_bytes(), true);
+    // No TLD hint is supplied, and UTF-8 is allowed as a result.
+    detector.guess(None, true).name().to_string()
+}
+
 pub fn cxx_string_to_string(s: &cxx::CxxString) -> String {
     match s.to_str() {
         Ok(s) => return s.to_string(),