From c6a859dff3d278fb53e0598647ca8c8affa4fec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Acid=20Chicken=20=28=E7=A1=AB=E9=85=B8=E9=B6=8F=29?= Date: Fri, 3 Nov 2023 18:15:38 +0900 Subject: [PATCH] fix: support non-standard encodings --- package.json | 5 +++- pnpm-lock.yaml | 22 ++++++++++++++++ src/encoding.ts | 69 +++++++++++++++++++++++++++++++++++++++++++++++++ src/index.ts | 12 ++++++++- 4 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 src/encoding.ts diff --git a/package.json b/package.json index e9b1fd2..054f9c6 100644 --- a/package.json +++ b/package.json @@ -2,12 +2,15 @@ "packageManager": "pnpm@8.3.1", "devDependencies": { "@cloudflare/workers-types": "^4.20230511.0", + "@types/whatwg-mimetype": "^3.0.1", "vitest": "^0.31.0", "wrangler": "^2.20.0" }, "dependencies": { + "@zxing/text-encoding": "^0.9.0", "hono": "^3.1.8", "html-entities": "^2.3.3", - "summaly": "^2.7.0" + "summaly": "^2.7.0", + "whatwg-mimetype": "^3.0.0" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index d558697..168dbfc 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1,6 +1,9 @@ lockfileVersion: '6.0' dependencies: + '@zxing/text-encoding': + specifier: ^0.9.0 + version: 0.9.0 hono: specifier: ^3.1.8 version: 3.1.8 @@ -10,11 +13,17 @@ dependencies: summaly: specifier: ^2.7.0 version: 2.7.0 + whatwg-mimetype: + specifier: ^3.0.0 + version: 3.0.0 devDependencies: '@cloudflare/workers-types': specifier: ^4.20230511.0 version: 4.20230511.0 + '@types/whatwg-mimetype': + specifier: ^3.0.1 + version: 3.0.1 vitest: specifier: ^0.31.0 version: 0.31.0 @@ -687,6 +696,10 @@ packages: resolution: {integrity: sha512-TgfOX+mGY/NyNxJLIbDWrO9DjGoVSW9+aB8H2yy1fy32jsvxijhmyJI9fDFgvz3YP4lvJaq9DzdR/M1bOgVc9g==} dev: true + /@types/whatwg-mimetype@3.0.1: + resolution: {integrity: sha512-dy1Os16KVaOyiwS237oURk7v0IGezg06FUlW//WcoAJMqrMSQlHNiYmZz06MgXdqE/uARd9h+sOm4AWRsJvUnQ==} + dev: true + /@vitest/expect@0.31.0: resolution: {integrity: sha512-Jlm8ZTyp6vMY9iz9Ny9a0BHnCG4fqBa8neCF6Pk/c/6vkUk49Ls6UBlgGAU82QnzzoaUs9E/mUhq/eq9uMOv/g==} dependencies: @@ -726,6 +739,10 @@ packages: pretty-format: 27.5.1 dev: true + /@zxing/text-encoding@0.9.0: + resolution: {integrity: sha512-U/4aVJ2mxI0aDNI8Uq0wEhMgY+u4CNtEb0om3+y3+niDAsoTCOB33UF0sxpzqzdqXLqmvc+vZyAt4O8pPdfkwA==} + dev: false + /accepts@1.3.8: resolution: {integrity: sha512-PYAthTa2m2VKxuvSD3DPC/Gy+U+sOA1LAuT8mkmRuvw+NACSaeXEQ+NHcVF7rONl6qcaxV3Uuemwawk+7+SJLw==} engines: {node: '>= 0.6'} @@ -2263,6 +2280,11 @@ packages: engines: {node: '>=6'} dev: true + /whatwg-mimetype@3.0.0: + resolution: {integrity: sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q==} + engines: {node: '>=12'} + dev: false + /which@2.0.2: resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==} engines: {node: '>= 8'} diff --git a/src/encoding.ts b/src/encoding.ts new file mode 100644 index 0000000..f132a54 --- /dev/null +++ b/src/encoding.ts @@ -0,0 +1,69 @@ +// due to the bug in the Cloudflare Workers runtime, we have to use @zxing/text-encoding instead of the built-in TextEncoder/TextDecoder. +import { encodingIndexes } from "@zxing/text-encoding/esm/encoding-indexes"; +(globalThis as any).TextEncodingIndexes = { encodingIndexes }; + +import { TextDecoder, TextEncoder } from "@zxing/text-encoding"; +import MIMEType from "whatwg-mimetype"; + +function getCharsetFromHeader(response: Response): string | null { + const contentType = response.headers.get("Content-Type"); + if (contentType === null) { + return null; + } + try { + const mimeType = new MIMEType(contentType); + return mimeType.parameters.get("charset") ?? null; + } catch { + return null; + } +} + +async function getCharsetFromBody(response: Response): Promise { + let charset: string | null = null; + const rewriter = new HTMLRewriter(); + rewriter.on("meta", { + element(element) { + const httpEquiv = element.getAttribute("http-equiv"); + if ( + charset === null && + httpEquiv !== null && + httpEquiv.toLowerCase() === "content-type" + ) { + const content = element.getAttribute("content"); + if (content !== null) { + try { + const mimeType = new MIMEType(content); + charset = mimeType.parameters.get("charset") ?? null; + } catch {} + } + } + const charsetAttr = element.getAttribute("charset"); + if (charsetAttr !== null) { + charset = charsetAttr; + } + }, + }); + const reader = rewriter.transform(response).body!.getReader(); + while (!(await reader.read()).done); + return charset; +} + +export async function getNormalizer( + response: Response +): Promise> { + const charset = + getCharsetFromHeader(response) ?? (await getCharsetFromBody(response)); + if (charset === null || charset.toLowerCase() === "utf-8") { + return new TransformStream(); + } + const decoder = new TextDecoder(charset, { fatal: true, ignoreBOM: true }); + const encoder = new TextEncoder(); + const transform = new TransformStream({ + transform(chunk, controller) { + controller.enqueue( + encoder.encode(decoder.decode(chunk, { stream: true })) + ); + }, + }); + return transform; +} diff --git a/src/index.ts b/src/index.ts index 3bedfc0..c848759 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,5 +1,6 @@ import { Hono } from "hono"; import summary from "./summary"; +import { getNormalizer } from "./encoding"; export interface Env { // Example binding to KV. Learn more at https://developers.cloudflare.com/workers/runtime-apis/kv/ @@ -17,6 +18,11 @@ export interface Env { const app = new Hono(); +app.onError((error, context) => { + console.error(error); + return context.json({ error: error.message, stack: error.stack }, 500); +}); + app.get("/url", async (context) => { let url: URL; try { @@ -26,9 +32,13 @@ app.get("/url", async (context) => { } const response = await fetch(url); url = new URL(response.url); + const [left, right] = response.body!.tee(); + const normalizer = await getNormalizer(new Response(left, response)); const rewriter = new HTMLRewriter(); const summarized = summary(url, rewriter); - const reader = rewriter.transform(response).body!.getReader(); + const reader = rewriter + .transform(new Response(right.pipeThrough(normalizer), response)) + .body!.getReader(); while (!(await reader.read()).done); return context.json(await summarized); });