diff --git a/src/general.ts b/src/general.ts index 124cc59..c7882ba 100644 --- a/src/general.ts +++ b/src/general.ts @@ -130,13 +130,30 @@ async function getOEmbedPlayer($: cheerio.CheerioAPI, pageUrl: string): Promise< }; } -export default async (_url: URL | string, lang: string | null = null): Promise => { +export type GeneralScrapingOptions = { + lang?: string | null; + userAgent?: string; + responseTimeout?: number; + operationTimeout?: number; + contentLengthLimit?: number; + contentLengthRequired?: boolean; +} + +export default async (_url: URL | string, opts?: GeneralScrapingOptions): Promise => { + let lang = opts?.lang; // eslint-disable-next-line no-param-reassign if (lang && !lang.match(/^[\w-]+(\s*,\s*[\w-]+)*$/)) lang = null; const url = typeof _url === 'string' ? new URL(_url) : _url; - const res = await scpaping(url.href, { lang: lang || undefined }); + const res = await scpaping(url.href, { + lang: lang || undefined, + userAgent: opts?.userAgent, + responseTimeout: opts?.responseTimeout, + operationTimeout: opts?.operationTimeout, + contentLengthLimit: opts?.contentLengthLimit, + contentLengthRequired: opts?.contentLengthRequired, + }); const $ = res.$; const twitterCard = $('meta[name="twitter:card"]').attr('content') || diff --git a/src/index.ts b/src/index.ts index b19269a..8070fb9 100644 --- a/src/index.ts +++ b/src/index.ts @@ -9,7 +9,7 @@ import * as Got from 'got'; import { SummalyResult } from './summary.js'; import { SummalyPlugin } from './iplugin.js'; export * from './iplugin.js'; -import general from './general.js'; +import general, { GeneralScrapingOptions } from './general.js'; import { setAgent } from './utils/got.js'; import { plugins as builtinPlugins } from './plugins/index.js'; import type { FastifyInstance } from 'fastify'; @@ -34,6 +34,35 @@ export type SummalyOptions = { * Custom HTTP agent */ agent?: Got.Agents; + + /** + * User-Agent for the request + */ + userAgent?: string; + + /** + * Response timeout. 
+ * Set timeouts for each phase, such as host name resolution and socket communication. + */ + responseTimeout?: number; + + /** + * Operation timeout. + * Set the timeout from the start to the end of the request. + */ + operationTimeout?: number; + + /** + * Maximum content length (in bytes). + * An error will occur if the content-length value returned from the other server is larger than this parameter (or if the received body size exceeds this parameter). + */ + contentLengthLimit?: number; + + /** + * Content length required. + * If set to true, it will be an error if the other server does not return content-length. + */ + contentLengthRequired?: boolean; }; export const summalyDefaultOptions = { @@ -68,8 +97,17 @@ export const summaly = async (url: string, options?: SummalyOptions): Promise plugin.test(_url))[0]; // Get summary + const scrapingOptions: GeneralScrapingOptions = { + lang: opts.lang, + userAgent: opts.userAgent, + responseTimeout: opts.responseTimeout, + operationTimeout: opts.operationTimeout, + contentLengthLimit: opts.contentLengthLimit, + contentLengthRequired: opts.contentLengthRequired, + }; + // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition - const summary = await (match ? 
match.summarize : general)(_url, scrapingOptions); if (summary == null) { throw new Error('failed summarize'); diff --git a/src/iplugin.ts b/src/iplugin.ts index bb87629..f910f9b 100644 --- a/src/iplugin.ts +++ b/src/iplugin.ts @@ -1,7 +1,8 @@ import Summary from './summary.js'; import type { URL } from 'node:url'; +import type { GeneralScrapingOptions } from './general.js'; export interface SummalyPlugin { test: (url: URL) => boolean; - summarize: (url: URL, lang?: string) => Promise; + summarize: (url: URL, opts?: GeneralScrapingOptions) => Promise; } diff --git a/src/plugins/branchio-deeplinks.ts b/src/plugins/branchio-deeplinks.ts index 2eacf33..67a1e3b 100644 --- a/src/plugins/branchio-deeplinks.ts +++ b/src/plugins/branchio-deeplinks.ts @@ -1,5 +1,5 @@ import { URL } from 'node:url'; -import general from '../general.js'; +import general, { GeneralScrapingOptions } from '../general.js'; import Summary from '../summary.js'; export function test(url: URL): boolean { @@ -8,10 +8,10 @@ export function test(url: URL): boolean { url.hostname === 'spotify.link'; } -export async function summarize(url: URL, lang: string | null = null): Promise { +export async function summarize(url: URL, opts?: GeneralScrapingOptions): Promise { // https://help.branch.io/using-branch/docs/creating-a-deep-link#redirections // Web版に強制リダイレクトすることでbranch.ioの独自ページが開くのを防ぐ url.searchParams.append('$web_only', 'true'); - return await general(url, lang); + return await general(url, opts); } diff --git a/src/utils/got.ts b/src/utils/got.ts index da6ee6c..f54ad41 100644 --- a/src/utils/got.ts +++ b/src/utils/got.ts @@ -11,6 +11,7 @@ const _filename = fileURLToPath(import.meta.url); const _dirname = dirname(_filename); export let agent: Got.Agents = {}; + export function setAgent(_agent: Got.Agents) { // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition agent = _agent || {}; @@ -22,34 +23,60 @@ export type GotOptions = { body?: string; headers: Record; typeFilter?: RegExp; + 
responseTimeout?: number; + operationTimeout?: number; + contentLengthLimit?: number; + contentLengthRequired?: boolean; } const repo = JSON.parse(readFileSync(`${_dirname}/../../package.json`, 'utf8')); -const RESPONSE_TIMEOUT = 20 * 1000; -const OPERATION_TIMEOUT = 60 * 1000; -const MAX_RESPONSE_SIZE = 10 * 1024 * 1024; -const BOT_UA = `SummalyBot/${repo.version}`; +const DEFAULT_RESPONSE_TIMEOUT = 20 * 1000; +const DEFAULT_OPERATION_TIMEOUT = 60 * 1000; +const DEFAULT_MAX_RESPONSE_SIZE = 10 * 1024 * 1024; +const DEFAULT_BOT_UA = `SummalyBot/${repo.version}`; -export async function scpaping(url: string, opts?: { lang?: string; }) { - const response = await getResponse({ +export async function scpaping( + url: string, + opts?: { + lang?: string; + userAgent?: string; + responseTimeout?: number; + operationTimeout?: number; + contentLengthLimit?: number; + contentLengthRequired?: boolean; + }, +) { + const args: Omit = { url, - method: 'GET', headers: { 'accept': 'text/html,application/xhtml+xml', - 'user-agent': BOT_UA, + 'user-agent': opts?.userAgent ?? 
DEFAULT_BOT_UA, 'accept-language': opts?.lang, }, typeFilter: /^(text\/html|application\/xhtml\+xml)/, + responseTimeout: opts?.responseTimeout, + operationTimeout: opts?.operationTimeout, + contentLengthLimit: opts?.contentLengthLimit, + contentLengthRequired: opts?.contentLengthRequired, + }; + + const headResponse = await getResponse({ + ...args, + method: 'HEAD', }); // SUMMALY_ALLOW_PRIVATE_IPはテスト用 const allowPrivateIp = process.env.SUMMALY_ALLOW_PRIVATE_IP === 'true' || Object.keys(agent).length > 0; - - if (!allowPrivateIp && response.ip && PrivateIp(response.ip)) { - throw new StatusError(`Private IP rejected ${response.ip}`, 400, 'Private IP Rejected'); + if (!allowPrivateIp && headResponse.ip && PrivateIp(headResponse.ip)) { + throw new StatusError(`Private IP rejected ${headResponse.ip}`, 400, 'Private IP Rejected'); } + const response = await getResponse({ + ...args, + method: 'GET', + }); + const encoding = detectEncoding(response.rawBody); const body = toUtf8(response.rawBody, encoding); const $ = cheerio.load(body); @@ -70,24 +97,22 @@ export async function get(url: string) { }, }); - return await res.body; + return res.body; } export async function head(url: string) { - const res = await getResponse({ + return await getResponse({ url, method: 'HEAD', headers: { 'accept': '*/*', }, }); - - return await res; } async function getResponse(args: GotOptions) { - const timeout = RESPONSE_TIMEOUT; - const operationTimeout = OPERATION_TIMEOUT; + const timeout = args.responseTimeout ?? DEFAULT_RESPONSE_TIMEOUT; + const operationTimeout = args.operationTimeout ?? 
DEFAULT_OPERATION_TIMEOUT; const req = got(args.url, { method: args.method, @@ -109,30 +134,37 @@ async function getResponse(args: GotOptions) { }, }); - return await receiveResponse({ req, typeFilter: args.typeFilter }); + const res = await receiveResponse({ req, opts: args }); + + // Check html + const contentType = res.headers['content-type']; + if (args.typeFilter && !contentType?.match(args.typeFilter)) { + throw new Error(`Rejected by type filter ${contentType}`); + } + + // 応答ヘッダでサイズチェック + const contentLength = res.headers['content-length']; + if (contentLength) { + const maxSize = args.contentLengthLimit ?? DEFAULT_MAX_RESPONSE_SIZE; + const size = Number(contentLength); + if (size > maxSize) { + throw new Error(`maxSize exceeded (${size} > ${maxSize}) on response`); + } + } else { + if (args.contentLengthRequired) { + throw new Error('content-length required'); + } + } + + return res; } -async function receiveResponse(args: { req: Got.CancelableRequest>, typeFilter?: RegExp }) { +async function receiveResponse(args: { + req: Got.CancelableRequest>, + opts: GotOptions, +}) { const req = args.req; - const maxSize = MAX_RESPONSE_SIZE; - - req.on('response', (res: Got.Response) => { - // Check html - if (args.typeFilter && !res.headers['content-type']?.match(args.typeFilter)) { - // console.warn(res.headers['content-type']); - req.cancel(`Rejected by type filter ${res.headers['content-type']}`); - return; - } - - // 応答ヘッダでサイズチェック - const contentLength = res.headers['content-length']; - if (contentLength != null) { - const size = Number(contentLength); - if (size > maxSize) { - req.cancel(`maxSize exceeded (${size} > ${maxSize}) on response`); - } - } - }); + const maxSize = args.opts.contentLengthLimit ?? 
DEFAULT_MAX_RESPONSE_SIZE; // 受信中のデータでサイズチェック req.on('downloadProgress', (progress: Got.Progress) => { diff --git a/test/index.ts b/test/index.ts index f78619c..c2a2871 100644 --- a/test/index.ts +++ b/test/index.ts @@ -48,7 +48,10 @@ afterEach(async () => { test('basic', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/basic.html')); + const content = fs.readFileSync(_dirname + '/htmls/basic.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); expect(await summaly(host)).toEqual({ @@ -80,7 +83,7 @@ test('Stage Bye Stage', async () => { expect(summary).toEqual( { 'title': '【アイドルマスター】「Stage Bye Stage」(歌:島村卯月、渋谷凛、本田未央)', - 'icon': 'https://www.youtube.com/s/desktop/28b0985e/img/favicon.ico', + 'icon': 'https://www.youtube.com/s/desktop/4feff1e2/img/favicon.ico', 'description': 'Website▶https://columbia.jp/idolmaster/Playlist▶https://www.youtube.com/playlist?list=PL83A2998CF3BBC86D2018年7月18日発売予定THE IDOLM@STER CINDERELLA GIRLS CG STAR...', 'thumbnail': 'https://i.ytimg.com/vi/NMIEAhH_fTU/maxresdefault.jpg', 'player': { @@ -107,7 +110,10 @@ test('Stage Bye Stage', async () => { test('faviconがHTML上で指定されていないが、ルートに存在する場合、正しく設定される', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/no-favicon.html')); + const content = fs.readFileSync(_dirname + '/htmls/no-favicon.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); app.get('/favicon.ico', (_, reply) => reply.status(200).send()); await app.listen({ port }); @@ -119,7 +125,10 @@ test('faviconがHTML上で指定されていないが、ルートに存在する test('faviconがHTML上で指定されていなくて、ルートにも存在しなかった場合 null になる', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname 
+ '/htmls/no-favicon.html')); + const content = fs.readFileSync(_dirname + '/htmls/no-favicon.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); app.get('*', (_, reply) => reply.status(404).send()); await app.listen({ port }); @@ -131,7 +140,10 @@ test('faviconがHTML上で指定されていなくて、ルートにも存在し test('titleがcleanupされる', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/dirty-title.html')); + const content = fs.readFileSync(_dirname + '/htmls/dirty-title.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -144,7 +156,10 @@ describe('Private IP blocking', () => { process.env.SUMMALY_ALLOW_PRIVATE_IP = 'false'; app = fastify(); app.get('*', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/og-title.html')); + const content = fs.readFileSync(_dirname + '/htmls/og-title.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); return app.listen({ port }); }); @@ -186,7 +201,10 @@ describe('OGP', () => { test('title', async () => { app = fastify(); app.get('*', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/og-title.html')); + const content = fs.readFileSync(_dirname + '/htmls/og-title.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -197,7 +215,10 @@ describe('OGP', () => { test('description', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/og-description.html')); + const content = fs.readFileSync(_dirname + '/htmls/og-description.html'); + reply.header('content-length', 
content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -208,7 +229,10 @@ describe('OGP', () => { test('site_name', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/og-site_name.html')); + const content = fs.readFileSync(_dirname + '/htmls/og-site_name.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -219,7 +243,10 @@ describe('OGP', () => { test('thumbnail', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/og-image.html')); + const content = fs.readFileSync(_dirname + '/htmls/og-image.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -232,7 +259,10 @@ describe('TwitterCard', () => { test('title', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/twitter-title.html')); + const content = fs.readFileSync(_dirname + '/htmls/twitter-title.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -243,7 +273,10 @@ describe('TwitterCard', () => { test('description', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/twitter-description.html')); + const content = fs.readFileSync(_dirname + '/htmls/twitter-description.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -254,7 +287,10 @@ describe('TwitterCard', () => { test('thumbnail', async () => { app 
= fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/twitter-image.html')); + const content = fs.readFileSync(_dirname + '/htmls/twitter-image.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -265,7 +301,10 @@ describe('TwitterCard', () => { test('Player detection - PeerTube:video => video', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/player-peertube-video.html')); + const content = fs.readFileSync(_dirname + '/htmls/player-peertube-video.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -277,7 +316,10 @@ describe('TwitterCard', () => { test('Player detection - Pleroma:video => video', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/player-pleroma-video.html')); + const content = fs.readFileSync(_dirname + '/htmls/player-pleroma-video.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -289,7 +331,10 @@ describe('TwitterCard', () => { test('Player detection - Pleroma:image => image', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/player-pleroma-image.html')); + const content = fs.readFileSync(_dirname + '/htmls/player-pleroma-image.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -302,12 +347,16 @@ describe('oEmbed', () => { const setUpFastify = async (oEmbedPath: string, htmlPath = 'htmls/oembed.html') => { app = 
fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(new URL(htmlPath, import.meta.url))); + const content = fs.readFileSync(new URL(htmlPath, import.meta.url)); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); app.get('/oembed.json', (request, reply) => { - return reply.send(fs.createReadStream( - new URL(oEmbedPath, new URL('oembed/', import.meta.url)), - )); + const content = fs.readFileSync(new URL(oEmbedPath, new URL('oembed/', import.meta.url))); + reply.header('content-length', content.length); + reply.header('content-type', 'application/json'); + return reply.send(content); }); await app.listen({ port }); }; @@ -432,7 +481,10 @@ describe('ActivityPub', () => { test('Basic', async () => { app = fastify(); app.get('*', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/activitypub.html')); + const content = fs.readFileSync(_dirname + '/htmls/activitypub.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -443,7 +495,10 @@ describe('ActivityPub', () => { test('Null', async () => { app = fastify(); app.get('*', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/basic.html')); + const content = fs.readFileSync(_dirname + '/htmls/basic.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); @@ -456,7 +511,10 @@ describe('sensitive', () => { test('default', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/basic.html')); + const content = fs.readFileSync(_dirname + '/htmls/basic.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return 
reply.send(content); }); await app.listen({ port }); expect((await summaly(host)).sensitive).toBe(false); @@ -465,9 +523,115 @@ describe('sensitive', () => { test('mixi:content-rating 1', async () => { app = fastify(); app.get('/', (request, reply) => { - return reply.send(fs.createReadStream(_dirname + '/htmls/mixi-sensitive.html')); + const content = fs.readFileSync(_dirname + '/htmls/mixi-sensitive.html'); + reply.header('content-length', content.length); + reply.header('content-type', 'text/html'); + return reply.send(content); }); await app.listen({ port }); expect((await summaly(host)).sensitive).toBe(true); }); }); + +describe('UserAgent', () => { + test('UA設定が反映されていること', async () => { + const content = fs.readFileSync(_dirname + '/htmls/basic.html'); + let ua: string | undefined = undefined; + + app = fastify(); + app.get('/', (request, reply) => { + reply.header('content-length', content.byteLength); + reply.header('content-type', 'text/html'); + ua = request.headers['user-agent']; + return reply.send(content); + }); + await app.listen({ port }); + await summaly(host, { userAgent: 'test-ua' }); + + expect(ua).toBe('test-ua'); + }); +}); + +describe('content-length limit', () => { + test('content-lengthの上限以内であればエラーが起こらないこと', async () => { + const content = fs.readFileSync(_dirname + '/htmls/basic.html'); + + app = fastify(); + app.get('/', (request, reply) => { + reply.header('content-length', content.byteLength); + reply.header('content-type', 'text/html'); + return reply.send(content); + }); + await app.listen({ port }); + + expect(await summaly(host, { contentLengthLimit: content.byteLength })).toBeDefined(); + }); + + test('content-lengthの上限を超えているとエラーになる事', async () => { + const content = fs.readFileSync(_dirname + '/htmls/basic.html'); + + app = fastify(); + app.get('/', (request, reply) => { + reply.header('content-length', content.byteLength); + reply.header('content-type', 'text/html'); + return reply.send(content); + }); + await app.listen({ port 
}); + + await expect(summaly(host, { contentLengthLimit: content.byteLength - 1 })).rejects.toThrow(); + }); +}); + +describe('content-length required', () => { + test('[オプション有効化時] content-lengthが返された場合はエラーとならないこと', async () => { + const content = fs.readFileSync(_dirname + '/htmls/basic.html'); + + app = fastify(); + app.get('/', (request, reply) => { + reply.header('content-length', content.byteLength); + reply.header('content-type', 'text/html'); + return reply.send(content); + }); + await app.listen({ port }); + + expect(await summaly(host, { contentLengthRequired: true, contentLengthLimit: content.byteLength })).toBeDefined(); + }); + + test('[オプション有効化時] content-lengthが返されない場合はエラーとなること', async () => { + app = fastify(); + app.get('/', (request, reply) => { + reply.header('content-type', 'text/html'); + // streamで渡さないとcontent-lengthを自動で設定されてしまう + return reply.send(fs.createReadStream(_dirname + '/htmls/basic.html')); + }); + await app.listen({ port }); + + await expect(summaly(host, { contentLengthRequired: true })).rejects.toThrow(); + }); + + test('[オプション無効化時] content-lengthが返された場合はエラーとならないこと', async () => { + const content = fs.readFileSync(_dirname + '/htmls/basic.html'); + + app = fastify(); + app.get('/', (request, reply) => { + reply.header('content-length', content.byteLength); + reply.header('content-type', 'text/html'); + return reply.send(content); + }); + await app.listen({ port }); + + expect(await summaly(host, { contentLengthRequired: false, contentLengthLimit: content.byteLength })).toBeDefined(); + }); + + test('[オプション無効化時] content-lengthが返されなくてもエラーとならないこと', async () => { + app = fastify(); + app.get('/', (request, reply) => { + reply.header('content-type', 'text/html'); + // streamで渡さないとcontent-lengthを自動で設定されてしまう + return reply.send(fs.createReadStream(_dirname + '/htmls/basic.html')); + }); + await app.listen({ port }); + + expect(await summaly(host, { contentLengthRequired: false })).toBeDefined(); + }); +});