summaly/built/utils/encoding.js
2023-02-12 12:20:19 +00:00

41 lines
1.1 KiB
JavaScript

import iconv from 'iconv-lite';
import jschardet from 'jschardet';
// Captures the encoding name out of a `charset=...` declaration
// (case-insensitive, whitespace around `=` allowed, optional opening quote).
const regCharset = /charset\s*=\s*["']?([\w-]+)/i;
/**
 * Work out which character encoding an HTML body uses.
 *
 * Two sources are consulted in order: jschardet's detection on the raw bytes
 * (called with minimumThreshold 0.99), then a `charset=` declaration found in
 * the body text. The first candidate that toEncoding accepts wins.
 * @param body Body in Buffer
 * @returns encoding name accepted by toEncoding, or 'utf-8' when neither source yields one
 */
export function detectEncoding(body) {
    // 1) Byte-level detection
    const guess = jschardet.detect(body, { minimumThreshold: 0.99 });
    const fromDetection = guess ? toEncoding(guess.encoding) : null;
    if (fromDetection != null) {
        return fromDetection;
    }
    // 2) charset declaration inside the body, read as ASCII text
    const declared = body.toString('ascii').match(regCharset);
    const fromMeta = declared ? toEncoding(declared[1]) : null;
    if (fromMeta != null) {
        return fromMeta;
    }
    // 3) Nothing usable found
    return 'utf-8';
}
/**
 * Decode a raw body with iconv.
 * @param body Body in Buffer
 * @param encoding Name of the encoding the body is stored in; handed to iconv as-is
 * @returns whatever iconv.decode produces for the given body and encoding
 */
export function toUtf8(body, encoding) {
    const decoded = iconv.decode(body, encoding);
    return decoded;
}
/**
 * Validate an encoding name against iconv and normalise Shift_JIS aliases.
 * @param candicate Encoding name to check
 * @returns 'cp932' for one of the listed Shift_JIS aliases, the name unchanged
 * for any other encoding iconv recognises, or null when iconv does not recognise it
 */
function toEncoding(candicate) {
    if (!iconv.encodingExists(candicate)) {
        return null;
    }
    // These aliases (compared case-insensitively) are all mapped to cp932
    const shiftJisAliases = ['shift_jis', 'shift-jis', 'windows-31j', 'x-sjis'];
    return shiftJisAliases.includes(candicate.toLowerCase()) ? 'cp932' : candicate;
}