fix: サマリ取得の動作改善+動作設定を可能にする (#23)

This commit is contained in:
おさむのひと 2024-03-17 16:53:32 +09:00 committed by GitHub
parent 71fe234d3e
commit c261071a82
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 322 additions and 70 deletions

View File

@ -130,13 +130,30 @@ async function getOEmbedPlayer($: cheerio.CheerioAPI, pageUrl: string): Promise<
};
}
export default async (_url: URL | string, lang: string | null = null): Promise<Summary | null> => {
/**
 * Options accepted by the general scraper (the default export of this module).
 * Every field except `lang` is forwarded unchanged to `scpaping`.
 */
export type GeneralScrapingOptions = {
/** Accept-Language value. A value that is not a comma-separated list of `[\w-]+` tags is discarded before the request is made. */
lang?: string | null;
/** User-Agent header to send. When omitted, the HTTP helper's default bot UA is used. */
userAgent?: string;
/** Response timeout. When omitted, the HTTP helper's default applies (presumably milliseconds; confirm against the HTTP helper). */
responseTimeout?: number;
/** Timeout for the whole operation. When omitted, the HTTP helper's default applies (presumably milliseconds; confirm against the HTTP helper). */
operationTimeout?: number;
/** Largest accepted content-length. When omitted, the HTTP helper's default maximum response size applies. */
contentLengthLimit?: number;
/** When true, a response without a content-length header is rejected with an error. */
contentLengthRequired?: boolean;
}
export default async (_url: URL | string, opts?: GeneralScrapingOptions): Promise<Summary | null> => {
let lang = opts?.lang;
// eslint-disable-next-line no-param-reassign
if (lang && !lang.match(/^[\w-]+(\s*,\s*[\w-]+)*$/)) lang = null;
const url = typeof _url === 'string' ? new URL(_url) : _url;
const res = await scpaping(url.href, { lang: lang || undefined });
const res = await scpaping(url.href, {
lang: lang || undefined,
userAgent: opts?.userAgent,
responseTimeout: opts?.responseTimeout,
operationTimeout: opts?.operationTimeout,
contentLengthLimit: opts?.contentLengthLimit,
contentLengthRequired: opts?.contentLengthRequired,
});
const $ = res.$;
const twitterCard =
$('meta[name="twitter:card"]').attr('content') ||

View File

@ -9,7 +9,7 @@ import * as Got from 'got';
import { SummalyResult } from './summary.js';
import { SummalyPlugin } from './iplugin.js';
export * from './iplugin.js';
import general from './general.js';
import general, { GeneralScrapingOptions } from './general.js';
import { setAgent } from './utils/got.js';
import { plugins as builtinPlugins } from './plugins/index.js';
import type { FastifyInstance } from 'fastify';
@ -34,6 +34,35 @@ export type SummalyOptions = {
* Custom HTTP agent
*/
agent?: Got.Agents;
/**
* User-Agent for the request
*/
userAgent?: string;
/**
* Response timeout.
* Set timeouts for each phase, such as host name resolution and socket communication.
*/
responseTimeout?: number;
/**
* Operation timeout.
* Set the timeout from the start to the end of the request.
*/
operationTimeout?: number;
/**
* Maximum content length.
* If set, an error will occur if the content-length value returned by the remote server is larger than this value (or if the received body size exceeds it).
*/
contentLengthLimit?: number;
/**
* Content length required.
* If set to true, it will be an error if the other server does not return content-length.
*/
contentLengthRequired?: boolean;
};
export const summalyDefaultOptions = {
@ -68,8 +97,17 @@ export const summaly = async (url: string, options?: SummalyOptions): Promise<Su
const match = plugins.filter(plugin => plugin.test(_url))[0];
// Get summary
const scrapingOptions: GeneralScrapingOptions = {
lang: opts.lang,
userAgent: opts.userAgent,
responseTimeout: opts.responseTimeout,
operationTimeout: opts.operationTimeout,
contentLengthLimit: opts.contentLengthLimit,
contentLengthRequired: opts.contentLengthRequired,
};
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
const summary = await (match ? match.summarize : general)(_url, opts.lang || undefined);
const summary = await (match ? match.summarize : general)(_url, scrapingOptions);
if (summary == null) {
throw new Error('failed summarize');

View File

@ -1,7 +1,8 @@
import Summary from './summary.js';
import type { URL } from 'node:url';
// Type-only import via a relative specifier, matching the other imports of this module
// (a path alias such as '@/general' is not rewritten in emitted declaration files).
import type { GeneralScrapingOptions } from './general.js';
export interface SummalyPlugin {
test: (url: URL) => boolean;
summarize: (url: URL, lang?: string) => Promise<Summary | null>;
summarize: (url: URL, opts?: GeneralScrapingOptions) => Promise<Summary | null>;
}

View File

@ -1,5 +1,5 @@
import { URL } from 'node:url';
import general from '../general.js';
import general, { GeneralScrapingOptions } from '../general.js';
import Summary from '../summary.js';
export function test(url: URL): boolean {
@ -8,10 +8,10 @@ export function test(url: URL): boolean {
url.hostname === 'spotify.link';
}
export async function summarize(url: URL, lang: string | null = null): Promise<Summary | null> {
export async function summarize(url: URL, opts?: GeneralScrapingOptions): Promise<Summary | null> {
// https://help.branch.io/using-branch/docs/creating-a-deep-link#redirections
// Web版に強制リダイレクトすることでbranch.ioの独自ページが開くのを防ぐ
url.searchParams.append('$web_only', 'true');
return await general(url, lang);
return await general(url, opts);
}

View File

@ -11,6 +11,7 @@ const _filename = fileURLToPath(import.meta.url);
const _dirname = dirname(_filename);
export let agent: Got.Agents = {};
export function setAgent(_agent: Got.Agents) {
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
agent = _agent || {};
@ -22,34 +23,60 @@ export type GotOptions = {
body?: string;
headers: Record<string, string | undefined>;
typeFilter?: RegExp;
responseTimeout?: number;
operationTimeout?: number;
contentLengthLimit?: number;
contentLengthRequired?: boolean;
}
const repo = JSON.parse(readFileSync(`${_dirname}/../../package.json`, 'utf8'));
const RESPONSE_TIMEOUT = 20 * 1000;
const OPERATION_TIMEOUT = 60 * 1000;
const MAX_RESPONSE_SIZE = 10 * 1024 * 1024;
const BOT_UA = `SummalyBot/${repo.version}`;
const DEFAULT_RESPONSE_TIMEOUT = 20 * 1000;
const DEFAULT_OPERATION_TIMEOUT = 60 * 1000;
const DEFAULT_MAX_RESPONSE_SIZE = 10 * 1024 * 1024;
const DEFAULT_BOT_UA = `SummalyBot/${repo.version}`;
export async function scpaping(url: string, opts?: { lang?: string; }) {
const response = await getResponse({
export async function scpaping(
url: string,
opts?: {
lang?: string;
userAgent?: string;
responseTimeout?: number;
operationTimeout?: number;
contentLengthLimit?: number;
contentLengthRequired?: boolean;
},
) {
const args: Omit<GotOptions, 'method'> = {
url,
method: 'GET',
headers: {
'accept': 'text/html,application/xhtml+xml',
'user-agent': BOT_UA,
'user-agent': opts?.userAgent ?? DEFAULT_BOT_UA,
'accept-language': opts?.lang,
},
typeFilter: /^(text\/html|application\/xhtml\+xml)/,
responseTimeout: opts?.responseTimeout,
operationTimeout: opts?.operationTimeout,
contentLengthLimit: opts?.contentLengthLimit,
contentLengthRequired: opts?.contentLengthRequired,
};
const headResponse = await getResponse({
...args,
method: 'HEAD',
});
// SUMMALY_ALLOW_PRIVATE_IPはテスト用
const allowPrivateIp = process.env.SUMMALY_ALLOW_PRIVATE_IP === 'true' || Object.keys(agent).length > 0;
if (!allowPrivateIp && response.ip && PrivateIp(response.ip)) {
throw new StatusError(`Private IP rejected ${response.ip}`, 400, 'Private IP Rejected');
if (!allowPrivateIp && headResponse.ip && PrivateIp(headResponse.ip)) {
throw new StatusError(`Private IP rejected ${headResponse.ip}`, 400, 'Private IP Rejected');
}
const response = await getResponse({
...args,
method: 'GET',
});
const encoding = detectEncoding(response.rawBody);
const body = toUtf8(response.rawBody, encoding);
const $ = cheerio.load(body);
@ -70,24 +97,22 @@ export async function get(url: string) {
},
});
return await res.body;
return res.body;
}
export async function head(url: string) {
const res = await getResponse({
return await getResponse({
url,
method: 'HEAD',
headers: {
'accept': '*/*',
},
});
return await res;
}
async function getResponse(args: GotOptions) {
const timeout = RESPONSE_TIMEOUT;
const operationTimeout = OPERATION_TIMEOUT;
const timeout = args.responseTimeout ?? DEFAULT_RESPONSE_TIMEOUT;
const operationTimeout = args.operationTimeout ?? DEFAULT_OPERATION_TIMEOUT;
const req = got<string>(args.url, {
method: args.method,
@ -109,30 +134,37 @@ async function getResponse(args: GotOptions) {
},
});
return await receiveResponse({ req, typeFilter: args.typeFilter });
const res = await receiveResponse({ req, opts: args });
// Check html
const contentType = res.headers['content-type'];
if (args.typeFilter && !contentType?.match(args.typeFilter)) {
throw new Error(`Rejected by type filter ${contentType}`);
}
// 応答ヘッダでサイズチェック
const contentLength = res.headers['content-length'];
if (contentLength) {
const maxSize = args.contentLengthLimit ?? DEFAULT_MAX_RESPONSE_SIZE;
const size = Number(contentLength);
if (size > maxSize) {
throw new Error(`maxSize exceeded (${size} > ${maxSize}) on response`);
}
} else {
if (args.contentLengthRequired) {
throw new Error('content-length required');
}
}
return res;
}
async function receiveResponse<T>(args: { req: Got.CancelableRequest<Got.Response<T>>, typeFilter?: RegExp }) {
async function receiveResponse<T>(args: {
req: Got.CancelableRequest<Got.Response<T>>,
opts: GotOptions,
}) {
const req = args.req;
const maxSize = MAX_RESPONSE_SIZE;
req.on('response', (res: Got.Response) => {
// Check html
if (args.typeFilter && !res.headers['content-type']?.match(args.typeFilter)) {
// console.warn(res.headers['content-type']);
req.cancel(`Rejected by type filter ${res.headers['content-type']}`);
return;
}
// 応答ヘッダでサイズチェック
const contentLength = res.headers['content-length'];
if (contentLength != null) {
const size = Number(contentLength);
if (size > maxSize) {
req.cancel(`maxSize exceeded (${size} > ${maxSize}) on response`);
}
}
});
const maxSize = args.opts.contentLengthLimit ?? DEFAULT_MAX_RESPONSE_SIZE;
// 受信中のデータでサイズチェック
req.on('downloadProgress', (progress: Got.Progress) => {

View File

@ -48,7 +48,10 @@ afterEach(async () => {
test('basic', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/basic.html'));
const content = fs.readFileSync(_dirname + '/htmls/basic.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
expect(await summaly(host)).toEqual({
@ -80,7 +83,7 @@ test('Stage Bye Stage', async () => {
expect(summary).toEqual(
{
'title': '【アイドルマスター】「Stage Bye Stage」(歌:島村卯月、渋谷凛、本田未央)',
'icon': 'https://www.youtube.com/s/desktop/28b0985e/img/favicon.ico',
'icon': 'https://www.youtube.com/s/desktop/4feff1e2/img/favicon.ico',
'description': 'Website▶https://columbia.jp/idolmaster/Playlist▶https://www.youtube.com/playlist?list=PL83A2998CF3BBC86D2018年7月18日発売予定THE IDOLM@STER CINDERELLA GIRLS CG STAR...',
'thumbnail': 'https://i.ytimg.com/vi/NMIEAhH_fTU/maxresdefault.jpg',
'player': {
@ -107,7 +110,10 @@ test('Stage Bye Stage', async () => {
test('faviconがHTML上で指定されていないが、ルートに存在する場合、正しく設定される', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/no-favicon.html'));
const content = fs.readFileSync(_dirname + '/htmls/no-favicon.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
app.get('/favicon.ico', (_, reply) => reply.status(200).send());
await app.listen({ port });
@ -119,7 +125,10 @@ test('faviconがHTML上で指定されていないが、ルートに存在する
test('faviconがHTML上で指定されていなくて、ルートにも存在しなかった場合 null になる', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/no-favicon.html'));
const content = fs.readFileSync(_dirname + '/htmls/no-favicon.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
app.get('*', (_, reply) => reply.status(404).send());
await app.listen({ port });
@ -131,7 +140,10 @@ test('faviconがHTML上で指定されていなくて、ルートにも存在し
test('titleがcleanupされる', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/dirty-title.html'));
// This test is about title cleanup, so it must serve dirty-title.html, the fixture
// the stream-based version of this handler used, not og-title.html.
const content = fs.readFileSync(_dirname + '/htmls/dirty-title.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -144,7 +156,10 @@ describe('Private IP blocking', () => {
process.env.SUMMALY_ALLOW_PRIVATE_IP = 'false';
app = fastify();
app.get('*', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/og-title.html'));
const content = fs.readFileSync(_dirname + '/htmls/og-title.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
return app.listen({ port });
});
@ -186,7 +201,10 @@ describe('OGP', () => {
test('title', async () => {
app = fastify();
app.get('*', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/og-title.html'));
const content = fs.readFileSync(_dirname + '/htmls/og-title.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -197,7 +215,10 @@ describe('OGP', () => {
test('description', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/og-description.html'));
const content = fs.readFileSync(_dirname + '/htmls/og-description.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -208,7 +229,10 @@ describe('OGP', () => {
test('site_name', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/og-site_name.html'));
const content = fs.readFileSync(_dirname + '/htmls/og-site_name.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -219,7 +243,10 @@ describe('OGP', () => {
test('thumbnail', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/og-image.html'));
const content = fs.readFileSync(_dirname + '/htmls/og-image.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -232,7 +259,10 @@ describe('TwitterCard', () => {
test('title', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/twitter-title.html'));
const content = fs.readFileSync(_dirname + '/htmls/twitter-title.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -243,7 +273,10 @@ describe('TwitterCard', () => {
test('description', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/twitter-description.html'));
const content = fs.readFileSync(_dirname + '/htmls/twitter-description.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -254,7 +287,10 @@ describe('TwitterCard', () => {
test('thumbnail', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/twitter-image.html'));
const content = fs.readFileSync(_dirname + '/htmls/twitter-image.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -265,7 +301,10 @@ describe('TwitterCard', () => {
test('Player detection - PeerTube:video => video', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/player-peertube-video.html'));
const content = fs.readFileSync(_dirname + '/htmls/player-peertube-video.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -277,7 +316,10 @@ describe('TwitterCard', () => {
test('Player detection - Pleroma:video => video', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/player-pleroma-video.html'));
const content = fs.readFileSync(_dirname + '/htmls/player-pleroma-video.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -289,7 +331,10 @@ describe('TwitterCard', () => {
test('Player detection - Pleroma:image => image', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/player-pleroma-image.html'));
const content = fs.readFileSync(_dirname + '/htmls/player-pleroma-image.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -302,12 +347,16 @@ describe('oEmbed', () => {
const setUpFastify = async (oEmbedPath: string, htmlPath = 'htmls/oembed.html') => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(new URL(htmlPath, import.meta.url)));
const content = fs.readFileSync(new URL(htmlPath, import.meta.url));
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
app.get('/oembed.json', (request, reply) => {
return reply.send(fs.createReadStream(
new URL(oEmbedPath, new URL('oembed/', import.meta.url)),
));
const content = fs.readFileSync(new URL(oEmbedPath, new URL('oembed/', import.meta.url)));
reply.header('content-length', content.length);
reply.header('content-type', 'application/json');
return reply.send(content);
});
await app.listen({ port });
};
@ -432,7 +481,10 @@ describe('ActivityPub', () => {
test('Basic', async () => {
app = fastify();
app.get('*', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/activitypub.html'));
const content = fs.readFileSync(_dirname + '/htmls/activitypub.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -443,7 +495,10 @@ describe('ActivityPub', () => {
test('Null', async () => {
app = fastify();
app.get('*', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/basic.html'));
const content = fs.readFileSync(_dirname + '/htmls/basic.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
@ -456,7 +511,10 @@ describe('sensitive', () => {
test('default', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/basic.html'));
const content = fs.readFileSync(_dirname + '/htmls/basic.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
expect((await summaly(host)).sensitive).toBe(false);
@ -465,9 +523,115 @@ describe('sensitive', () => {
test('mixi:content-rating 1', async () => {
app = fastify();
app.get('/', (request, reply) => {
return reply.send(fs.createReadStream(_dirname + '/htmls/mixi-sensitive.html'));
const content = fs.readFileSync(_dirname + '/htmls/mixi-sensitive.html');
reply.header('content-length', content.length);
reply.header('content-type', 'text/html');
return reply.send(content);
});
await app.listen({ port });
expect((await summaly(host)).sensitive).toBe(true);
});
});
/**
 * Checks that the `userAgent` option passed to summaly() is the User-Agent
 * header the target server receives.
 */
describe('UserAgent', () => {
	test('UA設定が反映されていること', async () => {
		const html = fs.readFileSync(_dirname + '/htmls/basic.html');
		// Written by the route handler, read by the assertion once summaly() has resolved.
		let receivedUa: string | undefined;

		app = fastify();
		app.get('/', (request, reply) => {
			receivedUa = request.headers['user-agent'];
			return reply
				.header('content-length', html.byteLength)
				.header('content-type', 'text/html')
				.send(html);
		});
		await app.listen({ port });

		await summaly(host, { userAgent: 'test-ua' });

		expect(receivedUa).toBe('test-ua');
	});
});
/**
 * Checks the `contentLengthLimit` option against a server that reports an
 * exact content-length for basic.html.
 */
describe('content-length limit', () => {
	/**
	 * Starts a server that answers `/` with basic.html and an explicit content-length.
	 * @returns the size of the served body in bytes
	 */
	const serveBasicHtml = async (): Promise<number> => {
		const html = fs.readFileSync(_dirname + '/htmls/basic.html');
		app = fastify();
		app.get('/', (request, reply) => {
			return reply
				.header('content-length', html.byteLength)
				.header('content-type', 'text/html')
				.send(html);
		});
		await app.listen({ port });
		return html.byteLength;
	};

	test('content-lengthの上限以内であればエラーが起こらないこと', async () => {
		const bodySize = await serveBasicHtml();

		// A limit equal to the body size is still within bounds.
		const summary = await summaly(host, { contentLengthLimit: bodySize });
		expect(summary).toBeDefined();
	});

	test('content-lengthの上限を超えているとエラーになる事', async () => {
		const bodySize = await serveBasicHtml();

		// One byte below the body size must be rejected.
		const pending = summaly(host, { contentLengthLimit: bodySize - 1 });
		await expect(pending).rejects.toThrow();
	});
});
/**
 * Checks the `contentLengthRequired` option in both states, against servers
 * that do and do not send a content-length header.
 */
describe('content-length required', () => {
	/**
	 * Starts a server that answers `/` with basic.html from a buffer and an explicit content-length.
	 * @returns the size of the served body in bytes
	 */
	const listenWithContentLength = async (): Promise<number> => {
		const html = fs.readFileSync(_dirname + '/htmls/basic.html');
		app = fastify();
		app.get('/', (request, reply) => {
			return reply
				.header('content-length', html.byteLength)
				.header('content-type', 'text/html')
				.send(html);
		});
		await app.listen({ port });
		return html.byteLength;
	};

	/** Starts a server that answers `/` with basic.html as a stream. */
	const listenWithoutContentLength = async (): Promise<void> => {
		app = fastify();
		app.get('/', (request, reply) => {
			// Must be sent as a stream; otherwise content-length would be set automatically.
			return reply
				.header('content-type', 'text/html')
				.send(fs.createReadStream(_dirname + '/htmls/basic.html'));
		});
		await app.listen({ port });
	};

	test('[オプション有効化時] content-lengthが返された場合はエラーとならないこと', async () => {
		const bodySize = await listenWithContentLength();

		const summary = await summaly(host, { contentLengthRequired: true, contentLengthLimit: bodySize });
		expect(summary).toBeDefined();
	});

	test('[オプション有効化時] content-lengthが返されない場合はエラーとなること', async () => {
		await listenWithoutContentLength();

		const pending = summaly(host, { contentLengthRequired: true });
		await expect(pending).rejects.toThrow();
	});

	test('[オプション無効化時] content-lengthが返された場合はエラーとならないこと', async () => {
		const bodySize = await listenWithContentLength();

		const summary = await summaly(host, { contentLengthRequired: false, contentLengthLimit: bodySize });
		expect(summary).toBeDefined();
	});

	test('[オプション無効化時] content-lengthが返されなくてもエラーとならないこと', async () => {
		await listenWithoutContentLength();

		const summary = await summaly(host, { contentLengthRequired: false });
		expect(summary).toBeDefined();
	});
});