fix: support non-standard encodings

This commit is contained in:
Acid Chicken (硫酸鶏) 2023-11-03 18:15:38 +09:00
parent d07f170d09
commit c6a859dff3
No known key found for this signature in database
GPG Key ID: 3E87B98A3F6BAB99
4 changed files with 106 additions and 2 deletions

View File

@ -2,12 +2,15 @@
"packageManager": "pnpm@8.3.1",
"devDependencies": {
"@cloudflare/workers-types": "^4.20230511.0",
"@types/whatwg-mimetype": "^3.0.1",
"vitest": "^0.31.0",
"wrangler": "^2.20.0"
},
"dependencies": {
"@zxing/text-encoding": "^0.9.0",
"hono": "^3.1.8",
"html-entities": "^2.3.3",
"summaly": "^2.7.0"
"summaly": "^2.7.0",
"whatwg-mimetype": "^3.0.0"
}
}

22
pnpm-lock.yaml generated
View File

@ -1,6 +1,9 @@
lockfileVersion: '6.0'
dependencies:
'@zxing/text-encoding':
specifier: ^0.9.0
version: 0.9.0
hono:
specifier: ^3.1.8
version: 3.1.8
@ -10,11 +13,17 @@ dependencies:
summaly:
specifier: ^2.7.0
version: 2.7.0
whatwg-mimetype:
specifier: ^3.0.0
version: 3.0.0
devDependencies:
'@cloudflare/workers-types':
specifier: ^4.20230511.0
version: 4.20230511.0
'@types/whatwg-mimetype':
specifier: ^3.0.1
version: 3.0.1
vitest:
specifier: ^0.31.0
version: 0.31.0
@ -687,6 +696,10 @@ packages:
resolution: {integrity: sha512-TgfOX+mGY/NyNxJLIbDWrO9DjGoVSW9+aB8H2yy1fy32jsvxijhmyJI9fDFgvz3YP4lvJaq9DzdR/M1bOgVc9g==}
dev: true
/@types/whatwg-mimetype@3.0.1:
resolution: {integrity: sha512-dy1Os16KVaOyiwS237oURk7v0IGezg06FUlW//WcoAJMqrMSQlHNiYmZz06MgXdqE/uARd9h+sOm4AWRsJvUnQ==}
dev: true
/@vitest/expect@0.31.0:
resolution: {integrity: sha512-Jlm8ZTyp6vMY9iz9Ny9a0BHnCG4fqBa8neCF6Pk/c/6vkUk49Ls6UBlgGAU82QnzzoaUs9E/mUhq/eq9uMOv/g==}
dependencies:
@ -726,6 +739,10 @@ packages:
pretty-format: 27.5.1
dev: true
/@zxing/text-encoding@0.9.0:
resolution: {integrity: sha512-U/4aVJ2mxI0aDNI8Uq0wEhMgY+u4CNtEb0om3+y3+niDAsoTCOB33UF0sxpzqzdqXLqmvc+vZyAt4O8pPdfkwA==}
dev: false
/accepts@1.3.8:
resolution: {integrity: sha512-PYAthTa2m2VKxuvSD3DPC/Gy+U+sOA1LAuT8mkmRuvw+NACSaeXEQ+NHcVF7rONl6qcaxV3Uuemwawk+7+SJLw==}
engines: {node: '>= 0.6'}
@ -2263,6 +2280,11 @@ packages:
engines: {node: '>=6'}
dev: true
/whatwg-mimetype@3.0.0:
resolution: {integrity: sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q==}
engines: {node: '>=12'}
dev: false
/which@2.0.2:
resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==}
engines: {node: '>= 8'}

69
src/encoding.ts Normal file
View File

@ -0,0 +1,69 @@
// Due to a bug in the Cloudflare Workers runtime, we have to use @zxing/text-encoding instead of the built-in TextEncoder/TextDecoder.
import { encodingIndexes } from "@zxing/text-encoding/esm/encoding-indexes";
(globalThis as any).TextEncodingIndexes = { encodingIndexes };
import { TextDecoder, TextEncoder } from "@zxing/text-encoding";
import MIMEType from "whatwg-mimetype";
/**
 * Extracts the `charset` parameter from the response's `Content-Type` header.
 *
 * Only headers are inspected; the response body is left untouched.
 *
 * @param response - Response whose `Content-Type` header is examined.
 * @returns The declared charset, or `null` when the header is absent, cannot
 *   be parsed as a MIME type, or carries no `charset` parameter.
 */
function getCharsetFromHeader(response: Response): string | null {
  const header = response.headers.get("Content-Type");
  if (header !== null) {
    try {
      const declared = new MIMEType(header).parameters.get("charset");
      return declared ?? null;
    } catch {
      // Unparseable Content-Type: handled the same as a missing charset.
    }
  }
  return null;
}
/**
 * Scans an HTML response body for a character-encoding declaration in its
 * `<meta>` elements.
 *
 * Two forms are recognised:
 *   - `<meta charset="...">`
 *   - `<meta http-equiv="Content-Type" content="...; charset=...">`
 *
 * The first `<meta>` element that yields a non-empty charset wins; later
 * declarations are ignored. When a single element carries both forms, the
 * `charset` attribute takes precedence.
 *
 * The whole body is consumed by this function, so callers must pass a
 * response whose body they do not need afterwards (e.g. one branch of a
 * `tee()`).
 *
 * @param response - Response whose body is scanned.
 * @returns The declared charset, or `null` when none is found or the
 *   response has no body.
 */
async function getCharsetFromBody(response: Response): Promise<string | null> {
  let charset: string | null = null;
  const rewriter = new HTMLRewriter();
  rewriter.on("meta", {
    element(element) {
      // First declaration wins: never let a later <meta> override it.
      if (charset !== null) {
        return;
      }

      // An empty or whitespace-only charset attribute declares nothing.
      const charsetAttr = element.getAttribute("charset")?.trim();
      if (charsetAttr) {
        charset = charsetAttr;
        return;
      }

      const httpEquiv = element.getAttribute("http-equiv");
      if (httpEquiv === null || httpEquiv.toLowerCase() !== "content-type") {
        return;
      }
      const content = element.getAttribute("content");
      if (content === null) {
        return;
      }
      try {
        const declared = new MIMEType(content).parameters.get("charset");
        // Treat an empty parameter value as "not declared" so scanning continues.
        charset = declared ? declared : null;
      } catch {
        // `content` is not a valid MIME type; keep looking at later elements.
      }
    },
  });

  const body = rewriter.transform(response).body;
  if (body === null) {
    // Nothing to scan (body-less response).
    return charset;
  }

  // Drain the transformed body so the rewriter runs over the whole document.
  const reader = body.getReader();
  while (!(await reader.read()).done);
  return charset;
}
/**
 * Builds a transform stream that re-encodes a response body to UTF-8.
 *
 * The source charset is taken from the `Content-Type` header when present,
 * otherwise from `<meta>` declarations in the body. Detecting the charset
 * from the body consumes `response`'s body, so pass a dedicated copy (e.g.
 * one branch of a `tee()`) and pipe the other copy through the result.
 *
 * An identity (pass-through) stream is returned when:
 *   - no charset is declared, or the declared label is empty;
 *   - the charset is already UTF-8 (`utf-8` / `utf8`, case-insensitive);
 *   - the declared label is not supported by the decoder.
 *
 * Otherwise the returned stream decodes incrementally with `fatal: true`,
 * so bytes that are invalid in the declared charset (including a truncated
 * multi-byte sequence at the very end of the stream) error the stream with
 * the decoder's exception.
 *
 * @param response - Response used solely for charset detection.
 * @returns A byte-to-byte transform stream producing UTF-8 output.
 */
export async function getNormalizer(
  response: Response
): Promise<TransformStream<Uint8Array, Uint8Array>> {
  const charset =
    getCharsetFromHeader(response) ?? (await getCharsetFromBody(response));

  // Encoding labels are case-insensitive and may carry surrounding whitespace.
  const label = charset?.trim().toLowerCase() ?? "";
  if (label === "" || label === "utf-8" || label === "utf8") {
    return new TransformStream();
  }

  let decoder: TextDecoder;
  try {
    decoder = new TextDecoder(label, { fatal: true, ignoreBOM: true });
  } catch (error) {
    if (error instanceof RangeError) {
      // Unsupported label declared by the remote page: pass the bytes through
      // unchanged rather than failing the whole request.
      return new TransformStream();
    }
    throw error;
  }

  const encoder = new TextEncoder();
  return new TransformStream<Uint8Array, Uint8Array>({
    transform(chunk, controller) {
      // `stream: true` keeps incomplete multi-byte sequences buffered until
      // the next chunk arrives.
      const text = decoder.decode(chunk, { stream: true });
      if (text !== "") {
        controller.enqueue(encoder.encode(text));
      }
    },
    flush(controller) {
      // Finalise the decoder so buffered bytes are emitted (or reported as an
      // error) instead of being silently dropped at end of stream.
      const tail = decoder.decode();
      if (tail !== "") {
        controller.enqueue(encoder.encode(tail));
      }
    },
  });
}

View File

@ -1,5 +1,6 @@
import { Hono } from "hono";
import summary from "./summary";
import { getNormalizer } from "./encoding";
export interface Env {
// Example binding to KV. Learn more at https://developers.cloudflare.com/workers/runtime-apis/kv/
@ -17,6 +18,11 @@ export interface Env {
const app = new Hono<Env>();
// Application-wide error handler: logs the error and answers with a JSON
// body and HTTP status 500.
// NOTE(review): the JSON body includes the error's stack trace — confirm that
// exposing it to clients is intended outside of debugging.
app.onError((err, c) => {
  console.error(err);
  const { message, stack } = err;
  return c.json({ error: message, stack }, 500);
});
app.get("/url", async (context) => {
let url: URL;
try {
@ -26,9 +32,13 @@ app.get("/url", async (context) => {
}
const response = await fetch(url);
url = new URL(response.url);
const [left, right] = response.body!.tee();
const normalizer = await getNormalizer(new Response(left, response));
const rewriter = new HTMLRewriter();
const summarized = summary(url, rewriter);
const reader = rewriter.transform(response).body!.getReader();
const reader = rewriter
.transform(new Response(right.pipeThrough(normalizer), response))
.body!.getReader();
while (!(await reader.read()).done);
return context.json(await summarized);
});