diff --git a/.gitignore b/.gitignore
index f661b91..6edd850 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,2 @@
 /node_modules
-/built
 npm-debug.log
diff --git a/built/general.d.ts b/built/general.d.ts
new file mode 100644
index 0000000..95fe685
--- /dev/null
+++ b/built/general.d.ts
@@ -0,0 +1,4 @@
+import * as URL from 'node:url';
+import Summary from './summary.js';
+declare const _default: (url: URL.Url, lang?: string | null) => Promise<Summary | null>;
+export default _default;
diff --git a/built/general.js b/built/general.js
new file mode 100644
index 0000000..ac1d711
--- /dev/null
+++ b/built/general.js
@@ -0,0 +1,106 @@
+import * as URL from 'node:url';
+import clip from './utils/clip.js';
+import cleanupTitle from './utils/cleanup-title.js';
+import { decode as decodeHtml } from 'html-entities';
+import { head, scpaping } from './utils/got.js';
+/**
+ * Build a summary of a page from its generic metadata (Open Graph,
+ * Twitter Card and plain HTML tags).
+ *
+ * @param url Parsed URL of the page to scrape
+ * @param lang Accept-Language value; dropped unless it is a comma separated list of language tags
+ * @returns The summary, or null when no title could be read
+ */
+export default async (url, lang = null) => {
+    if (lang && !lang.match(/^[\w-]+(\s*,\s*[\w-]+)*$/))
+        lang = null;
+    const res = await scpaping(url.href, { lang: lang || undefined });
+    const $ = res.$;
+    const twitterCard = $('meta[property="twitter:card"]').attr('content');
+    let title = $('meta[property="og:title"]').attr('content') ||
+        $('meta[property="twitter:title"]').attr('content') ||
+        $('title').text();
+    if (title === undefined || title === null) {
+        return null;
+    }
+    title = clip(decodeHtml(title), 100);
+    let image = $('meta[property="og:image"]').attr('content') ||
+        $('meta[property="twitter:image"]').attr('content') ||
+        $('link[rel="image_src"]').attr('href') ||
+        $('link[rel="apple-touch-icon"]').attr('href') ||
+        $('link[rel="apple-touch-icon image_src"]').attr('href');
+    image = image ? URL.resolve(url.href, image) : null;
+    const playerUrl = (twitterCard !== 'summary_large_image' && $('meta[property="twitter:player"]').attr('content')) ||
+        (twitterCard !== 'summary_large_image' && $('meta[name="twitter:player"]').attr('content')) ||
+        $('meta[property="og:video"]').attr('content') ||
+        $('meta[property="og:video:secure_url"]').attr('content') ||
+        $('meta[property="og:video:url"]').attr('content');
+    const playerWidth = parseInt($('meta[property="twitter:player:width"]').attr('content') ||
+        $('meta[name="twitter:player:width"]').attr('content') ||
+        $('meta[property="og:video:width"]').attr('content') ||
+        '', 10);
+    const playerHeight = parseInt($('meta[property="twitter:player:height"]').attr('content') ||
+        $('meta[name="twitter:player:height"]').attr('content') ||
+        $('meta[property="og:video:height"]').attr('content') ||
+        '', 10);
+    let description = $('meta[property="og:description"]').attr('content') ||
+        $('meta[property="twitter:description"]').attr('content') ||
+        $('meta[name="description"]').attr('content');
+    description = description
+        ? clip(decodeHtml(description), 300)
+        : null;
+    if (title === description) {
+        description = null;
+    }
+    let siteName = $('meta[property="og:site_name"]').attr('content') ||
+        $('meta[name="application-name"]').attr('content') ||
+        url.hostname;
+    siteName = siteName ? decodeHtml(siteName) : null;
+    const favicon = $('link[rel="shortcut icon"]').attr('href') ||
+        $('link[rel="icon"]').attr('href') ||
+        '/favicon.ico';
+    const sensitive = $('.tweet').attr('data-possibly-sensitive') === 'true';
+    const find = async (path) => {
+        const target = URL.resolve(url.href, path);
+        try {
+            await head(target);
+            return target;
+        }
+        catch (e) {
+            return null;
+        }
+    };
+    // Convert a relative URL (ex. test) into an absolute one (ex. /test)
+    const toAbsolute = (relativeURLString) => {
+        const relativeURL = URL.parse(relativeURLString);
+        const isAbsolute = relativeURL.slashes || relativeURL.path !== null && relativeURL.path[0] === '/';
+        // Already absolute: return the value as is
+        if (isAbsolute) {
+            return relativeURLString;
+        }
+        // Otherwise return it with a leading slash
+        return '/' + relativeURLString;
+    };
+    const icon = await find(favicon) ||
+        // Retry after converting the relative path into an absolute one
+        await find(toAbsolute(favicon)) ||
+        null;
+    // Clean up the title
+    title = cleanupTitle(title, siteName);
+    if (title === '') {
+        title = siteName;
+    }
+    return {
+        title: title || null,
+        icon: icon || null,
+        description: description || null,
+        thumbnail: image || null,
+        player: {
+            url: playerUrl || null,
+            width: Number.isNaN(playerWidth) ? null : playerWidth,
+            height: Number.isNaN(playerHeight) ? null : playerHeight
+        },
+        sitename: siteName || null,
+        sensitive,
+    };
+};
diff --git a/built/index.d.ts b/built/index.d.ts
new file mode 100644
index 0000000..f3e2f03
--- /dev/null
+++ b/built/index.d.ts
@@ -0,0 +1,39 @@
+/**
+ * summaly
+ * https://github.com/syuilo/summaly
+ */
+import Summary from './summary.js';
+import type { IPlugin as _IPlugin } from './iplugin.js';
+export declare type IPlugin = _IPlugin;
+import * as Got from 'got';
+import type { FastifyInstance } from 'fastify';
+declare type Options = {
+    /**
+     * Accept-Language for the request
+     */
+    lang?: string | null;
+    /**
+     * Whether to follow redirects
+     */
+    followRedirects?: boolean;
+    /**
+     * Custom Plugins
+     */
+    plugins?: IPlugin[];
+    /**
+     * Custom HTTP agent
+     */
+    agent?: Got.Agents;
+};
+declare type Result = Summary & {
+    /**
+     * The actual url of that web page
+     */
+    url: string;
+};
+/**
+ * Summarize a web page
+ */
+export declare const summaly: (url: string, options?: Options | undefined) => Promise<Result>;
+export default function (fastify: FastifyInstance, options: Options, done: (err?: Error) => void): void;
+export {};
diff --git a/built/index.js b/built/index.js
new file mode 100644
index 0000000..8a6524f
--- /dev/null
+++ b/built/index.js
@@ -0,0 +1,77 @@
+/**
+ * summaly
+ * https://github.com/syuilo/summaly
+ */
+import * as URL from 'node:url';
+import tracer from 'trace-redirect';
+import general from './general.js';
+import { setAgent } from './utils/got.js';
+import { plugins as builtinPlugins } from './plugins/index.js';
+const defaultOptions = {
+    lang: null,
+    followRedirects: true,
+    plugins: [],
+};
+/**
+ * Summarize a web page
+ *
+ * @param url URL of the page
+ * @param options lang / followRedirects / plugins / agent; missing keys fall back to defaultOptions
+ * @returns The summary, extended with the URL that was actually summarized
+ * @throws 'failed summarize' when neither a plugin nor the general scraper produced a summary
+ */
+export const summaly = async (url, options) => {
+    if (options?.agent)
+        setAgent(options.agent);
+    // Merge into a fresh object so per-call options never leak into defaultOptions
+    const opts = Object.assign({}, defaultOptions, options);
+    const plugins = builtinPlugins.concat(opts.plugins || []);
+    let actualUrl = url;
+    if (opts.followRedirects) {
+        // `.catch(() => url)` would do, but feeding trace-redirect to jest is a hassle, hence try-catch
+        try {
+            actualUrl = await tracer(url);
+        }
+        catch (e) {
+            actualUrl = url;
+        }
+    }
+    const _url = URL.parse(actualUrl, true);
+    // Find matching plugin
+    const match = plugins.find(plugin => plugin.test(_url));
+    // Get summary
+    const summary = await (match ? match.summarize : general)(_url, opts.lang || undefined);
+    if (summary == null) {
+        throw 'failed summarize';
+    }
+    return Object.assign(summary, {
+        url: actualUrl
+    });
+};
+/**
+ * Fastify plugin: registers GET /url, which summarizes the page given in the `url` query parameter
+ */
+export default function (fastify, options, done) {
+    fastify.get('/url', async (req, reply) => {
+        const url = req.query.url;
+        if (url == null) {
+            return reply.status(400).send({
+                error: 'url is required'
+            });
+        }
+        try {
+            const summary = await summaly(url, {
+                lang: req.query.lang,
+                followRedirects: false,
+                ...options,
+            });
+            return summary;
+        }
+        catch (e) {
+            return reply.status(500).send({
+                error: e
+            });
+        }
+    });
+    done();
+}
diff --git a/built/iplugin.d.ts b/built/iplugin.d.ts
new file mode 100644
index 0000000..ba5028b
--- /dev/null
+++ b/built/iplugin.d.ts
@@ -0,0 +1,7 @@
+/// <reference types="node" />
+import * as URL from 'node:url';
+import Summary from './summary.js';
+export interface IPlugin {
+    test: (url: URL.Url) => boolean;
+    summarize: (url: URL.Url, lang?: string) => Promise<Summary>;
+}
diff --git a/built/iplugin.js b/built/iplugin.js
new file mode 100644
index 0000000..cb0ff5c
--- /dev/null
+++ b/built/iplugin.js
@@ -0,0 +1 @@
+export {};
diff --git a/built/plugins/amazon.d.ts b/built/plugins/amazon.d.ts
new file mode 100644
index 0000000..04a2d3d
--- /dev/null
+++ b/built/plugins/amazon.d.ts
@@ -0,0 +1,5 @@
+/// <reference types="node" />
+import * as URL from 'node:url';
+import summary from '../summary.js';
+export declare function test(url: URL.Url): boolean;
+export declare function summarize(url: URL.Url): Promise<summary>;
diff --git a/built/plugins/amazon.js b/built/plugins/amazon.js
new file mode 100644
index 0000000..3d329d9
--- /dev/null
+++ b/built/plugins/amazon.js
@@ -0,0 +1,46 @@
+import { scpaping } from '../utils/got.js';
+/** Whether the URL's hostname is one of the listed Amazon storefronts */
+export function test(url) {
+    return url.hostname === 'www.amazon.com' ||
+        url.hostname === 'www.amazon.co.jp' ||
+        url.hostname === 'www.amazon.ca' ||
+        url.hostname === 'www.amazon.com.br' ||
+        url.hostname === 'www.amazon.com.mx' ||
+        url.hostname === 'www.amazon.co.uk' ||
+        url.hostname === 'www.amazon.de' ||
+        url.hostname === 'www.amazon.fr' ||
+        url.hostname === 'www.amazon.it' ||
+        url.hostname === 'www.amazon.es' ||
+        url.hostname === 'www.amazon.nl' ||
+        url.hostname === 'www.amazon.cn' ||
+        url.hostname === 'www.amazon.in' ||
+        url.hostname === 'www.amazon.com.au' ||
+        url.hostname === 'www.amazon.au';
+}
+/** Scrape title, description, thumbnail and player metadata from an Amazon page */
+export async function summarize(url) {
+    const res = await scpaping(url.href);
+    const $ = res.$;
+    const title = $('#title').text();
+    const description = $('#productDescription').text() ||
+        $('meta[name="description"]').attr('content');
+    const thumbnail = $('#landingImage').attr('src');
+    const playerUrl = $('meta[property="twitter:player"]').attr('content') ||
+        $('meta[name="twitter:player"]').attr('content');
+    const playerWidth = $('meta[property="twitter:player:width"]').attr('content') ||
+        $('meta[name="twitter:player:width"]').attr('content');
+    const playerHeight = $('meta[property="twitter:player:height"]').attr('content') ||
+        $('meta[name="twitter:player:height"]').attr('content');
+    return {
+        title: title ? title.trim() : null,
+        icon: 'https://www.amazon.com/favicon.ico',
+        description: description ? description.trim() : null,
+        thumbnail: thumbnail ? thumbnail.trim() : null,
+        player: {
+            url: playerUrl || null,
+            width: playerWidth ? parseInt(playerWidth, 10) : null,
+            height: playerHeight ? parseInt(playerHeight, 10) : null
+        },
+        sitename: 'Amazon'
+    };
+}
diff --git a/built/plugins/index.d.ts b/built/plugins/index.d.ts
new file mode 100644
index 0000000..780a2ad
--- /dev/null
+++ b/built/plugins/index.d.ts
@@ -0,0 +1,2 @@
+import type { IPlugin } from '../iplugin.js';
+export declare const plugins: IPlugin[];
diff --git a/built/plugins/index.js b/built/plugins/index.js
new file mode 100644
index 0000000..13de039
--- /dev/null
+++ b/built/plugins/index.js
@@ -0,0 +1,6 @@
+import * as amazon from './amazon.js';
+import * as wikipedia from './wikipedia.js';
+export const plugins = [
+    amazon,
+    wikipedia,
+];
diff --git a/built/plugins/wikipedia.d.ts b/built/plugins/wikipedia.d.ts
new file mode 100644
index 0000000..04a2d3d
--- /dev/null
+++ b/built/plugins/wikipedia.d.ts
@@ -0,0 +1,5 @@
+/// <reference types="node" />
+import * as URL from 'node:url';
+import summary from '../summary.js';
+export declare function test(url: URL.Url): boolean;
+export declare function summarize(url: URL.Url): Promise<summary>;
diff --git a/built/plugins/wikipedia.js b/built/plugins/wikipedia.js
new file mode 100644
index 0000000..bb38703
--- /dev/null
+++ b/built/plugins/wikipedia.js
@@ -0,0 +1,36 @@
+import { get } from '../utils/got.js';
+import debug from 'debug';
+import clip from './../utils/clip.js';
+const log = debug('summaly:plugins:wikipedia');
+export function test(url) {
+    if (!url.hostname)
+        return false;
+    return /\.wikipedia\.org$/.test(url.hostname);
+}
+export async function summarize(url) {
+    const lang = url.host ? url.host.split('.')[0] : null;
+    const title = url.pathname ? url.pathname.split('/')[2] : null;
+    const endpoint = `https://${lang}.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro=&explaintext=&titles=${title}`;
+    log(`lang is ${lang}`);
+    log(`title is ${title}`);
+    log(`endpoint is ${endpoint}`);
+    let body = await get(endpoint);
+    body = JSON.parse(body);
+    log(body);
+    if (!('query' in body) || !('pages' in body.query)) {
+        throw 'fetch failed';
+    }
+    const info = body.query.pages[Object.keys(body.query.pages)[0]];
+    return {
+        title: info.title,
+        icon: 'https://wikipedia.org/static/favicon/wikipedia.ico',
+        description: clip(info.extract, 300),
+        thumbnail: `https://wikipedia.org/static/images/project-logos/${lang}wiki.png`,
+        player: {
+            url: null,
+            width: null,
+            height: null
+        },
+        sitename: 'Wikipedia'
+    };
+}
diff --git a/built/server/index.d.ts b/built/server/index.d.ts
new file mode 100644
index 0000000..cb0ff5c
--- /dev/null
+++ b/built/server/index.d.ts
@@ -0,0 +1 @@
+export {};
diff --git a/built/server/index.js b/built/server/index.js
new file mode 100644
index 0000000..e32567a
--- /dev/null
+++ b/built/server/index.js
@@ -0,0 +1,23 @@
+import * as http from 'http';
+import Koa from 'koa';
+// The default export of ../index.js is the Fastify plugin; summaly itself is a named export
+import { summaly } from '../index.js';
+const app = new Koa();
+app.use(async (ctx) => {
+    if (!ctx.query.url) {
+        ctx.status = 400;
+        return;
+    }
+    try {
+        const summary = await summaly(ctx.query.url, {
+            lang: ctx.query.lang,
+            followRedirects: false
+        });
+        ctx.body = summary;
+    }
+    catch (e) {
+        ctx.status = 500;
+    }
+});
+const server = http.createServer(app.callback());
+server.listen(process.env.PORT || 80);
diff --git a/built/summary.d.ts b/built/summary.d.ts
new file mode 100644
index 0000000..e343f80
--- /dev/null
+++ b/built/summary.d.ts
@@ -0,0 +1,45 @@
+declare type Summary = {
+    /**
+     * The description of that web page
+     */
+    description: string | null;
+    /**
+     * The url of the icon of that web page
+     */
+    icon: string | null;
+    /**
+     * The name of site of that web page
+     */
+    sitename: string | null;
+    /**
+     * The url of the thumbnail of that web page
+     */
+    thumbnail: string | null;
+    /**
+     * The player of that web page
+     */
+    player: Player;
+    /**
+     * The title of that web page
+     */
+    title: string | null;
+    /**
+     * Possibly sensitive
+     */
+    sensitive?: boolean;
+};
+export default Summary;
+export declare type Player = {
+    /**
+     * The url of the player
+     */
+    url: string | null;
+    /**
+     * The width of the player
+     */
+    width: number | null;
+    /**
+     * The height of the player
+     */
+    height: number | null;
+};
diff --git a/built/summary.js b/built/summary.js
new file mode 100644
index 0000000..cb0ff5c
--- /dev/null
+++ b/built/summary.js
@@ -0,0 +1 @@
+export {};
diff --git a/built/utils/cleanup-title.d.ts b/built/utils/cleanup-title.d.ts
new file mode 100644
index 0000000..a45a0d4
--- /dev/null
+++ b/built/utils/cleanup-title.d.ts
@@ -0,0 +1 @@
+export default function (title: string, siteName?: string | null): string;
diff --git a/built/utils/cleanup-title.js b/built/utils/cleanup-title.js
new file mode 100644
index 0000000..3855410
--- /dev/null
+++ b/built/utils/cleanup-title.js
@@ -0,0 +1,19 @@
+import escapeRegExp from 'escape-regexp';
+export default function (title, siteName) {
+    title = title.trim();
+    if (siteName) {
+        siteName = siteName.trim();
+        const x = escapeRegExp(siteName);
+        const patterns = [
+            `^(.+?)\\s?[\\-\\|:・]\\s?${x}$`
+        ];
+        for (let i = 0; i < patterns.length; i++) {
+            const pattern = new RegExp(patterns[i]);
+            const [, match] = pattern.exec(title) || [null, null];
+            if (match) {
+                return match;
+            }
+        }
+    }
+    return title;
+}
diff --git a/built/utils/clip.d.ts b/built/utils/clip.d.ts
new file mode 100644
index 0000000..ba0a174
--- /dev/null
+++ b/built/utils/clip.d.ts
@@ -0,0 +1 @@
+export default function (s: string, max: number): string;
diff --git a/built/utils/clip.js b/built/utils/clip.js
new file mode 100644
index 0000000..1e614ce
--- /dev/null
+++ b/built/utils/clip.js
@@ -0,0 +1,13 @@
+import nullOrEmpty from './null-or-empty.js';
+export default function (s, max) {
+    if (nullOrEmpty(s)) {
+        return s;
+    }
+    s = s.trim();
+    if (s.length > max) {
+        return s.slice(0, max) + '...';
+    }
+    else {
+        return s;
+    }
+}
diff --git a/built/utils/encoding.d.ts b/built/utils/encoding.d.ts
new file mode 100644
index 0000000..e9c4617
--- /dev/null
+++ b/built/utils/encoding.d.ts
@@ -0,0 +1,8 @@
+/// <reference types="node" />
+/**
+ * Detect HTML encoding
+ * @param body Body in Buffer
+ * @returns encoding
+ */
+export declare function detectEncoding(body: Buffer): string;
+export declare function toUtf8(body: Buffer, encoding: string): string;
diff --git a/built/utils/encoding.js b/built/utils/encoding.js
new file mode 100644
index 0000000..a8ee858
--- /dev/null
+++ b/built/utils/encoding.js
@@ -0,0 +1,40 @@
+import iconv from 'iconv-lite';
+import jschardet from 'jschardet';
+const regCharset = new RegExp(/charset\s*=\s*["']?([\w-]+)/, 'i');
+/**
+ * Detect HTML encoding
+ * @param body Body in Buffer
+ * @returns encoding
+ */
+export function detectEncoding(body) {
+    // By detection
+    const detected = jschardet.detect(body, { minimumThreshold: 0.99 });
+    if (detected) {
+        const candicate = detected.encoding;
+        const encoding = toEncoding(candicate);
+        if (encoding != null)
+            return encoding;
+    }
+    // From meta
+    const matchMeta = body.toString('ascii').match(regCharset);
+    if (matchMeta) {
+        const candicate = matchMeta[1];
+        const encoding = toEncoding(candicate);
+        if (encoding != null)
+            return encoding;
+    }
+    return 'utf-8';
+}
+export function toUtf8(body, encoding) {
+    return iconv.decode(body, encoding);
+}
+function toEncoding(candicate) {
+    if (iconv.encodingExists(candicate)) {
+        if (['shift_jis', 'shift-jis', 'windows-31j', 'x-sjis'].includes(candicate.toLowerCase()))
+            return 'cp932';
+        return candicate;
+    }
+    else {
+        return null;
+    }
+}
diff --git a/built/utils/got.d.ts b/built/utils/got.d.ts
new file mode 100644
index 0000000..9b70686
--- /dev/null
+++ b/built/utils/got.d.ts
@@ -0,0 +1,20 @@
+import * as Got from 'got';
+import * as cheerio from 'cheerio';
+export declare let agent: Got.Agents;
+export declare function setAgent(_agent: Got.Agents): void;
+export declare type GotOptions = {
+    url: string;
+    method: 'GET' | 'POST' | 'HEAD';
+    body?: string;
+    headers: Record<string, string | undefined>;
+    typeFilter?: RegExp;
+};
+export declare function scpaping(url: string, opts?: {
+    lang?: string;
+}): Promise<{
+    body: string;
+    $: cheerio.CheerioAPI;
+    response: Got.Response;
+}>;
+export declare function get(url: string): Promise<string>;
+export declare function head(url: string): Promise<Got.Response<string>>;
diff --git a/built/utils/got.js b/built/utils/got.js
new file mode 100644
index 0000000..f2f73bc
--- /dev/null
+++ b/built/utils/got.js
@@ -0,0 +1,123 @@
+import got, * as Got from 'got';
+import { StatusError } from './status-error.js';
+import { detectEncoding, toUtf8 } from './encoding.js';
+import * as cheerio from 'cheerio';
+import PrivateIp from 'private-ip';
+import { dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { readFileSync } from 'node:fs';
+const _filename = fileURLToPath(import.meta.url);
+const _dirname = dirname(_filename);
+export let agent = {};
+export function setAgent(_agent) {
+    agent = _agent || {};
+}
+const repo = JSON.parse(readFileSync(`${_dirname}/../../package.json`, 'utf8'));
+const RESPONSE_TIMEOUT = 20 * 1000;
+const OPERATION_TIMEOUT = 60 * 1000;
+const MAX_RESPONSE_SIZE = 10 * 1024 * 1024;
+const BOT_UA = `SummalyBot/${repo.version}`;
+export async function scpaping(url, opts) {
+    const response = await getResponse({
+        url,
+        method: 'GET',
+        headers: {
+            'accept': 'text/html,application/xhtml+xml',
+            'user-agent': BOT_UA,
+            'accept-language': opts?.lang
+        },
+        typeFilter: /^(text\/html|application\/xhtml\+xml)/,
+    });
+    // Opt-out for tests. NOTE(review): this private-IP check only runs after the request has completed
+    const allowPrivateIp = process.env.SUMMALY_ALLOW_PRIVATE_IP === 'true';
+    if (!allowPrivateIp && response.ip && PrivateIp(response.ip)) {
+        throw new StatusError(`Private IP rejected ${response.ip}`, 400, 'Private IP Rejected');
+    }
+    const encoding = detectEncoding(response.rawBody);
+    const body = toUtf8(response.rawBody, encoding);
+    const $ = cheerio.load(body);
+    return {
+        body,
+        $,
+        response,
+    };
+}
+export async function get(url) {
+    const res = await getResponse({
+        url,
+        method: 'GET',
+        headers: {
+            'accept': '*/*',
+        },
+    });
+    return res.body;
+}
+export async function head(url) {
+    const res = await getResponse({
+        url,
+        method: 'HEAD',
+        headers: {
+            'accept': '*/*',
+        },
+    });
+    return res;
+}
+async function getResponse(args) {
+    const timeout = RESPONSE_TIMEOUT;
+    const operationTimeout = OPERATION_TIMEOUT;
+    const req = got(args.url, {
+        method: args.method,
+        headers: args.headers,
+        body: args.body,
+        timeout: {
+            lookup: timeout,
+            connect: timeout,
+            secureConnect: timeout,
+            socket: timeout,
+            response: timeout,
+            send: timeout,
+            request: operationTimeout, // whole operation timeout
+        },
+        agent,
+        http2: false,
+        retry: {
+            limit: 0,
+        },
+    });
+    return await receiveResponce({ req, typeFilter: args.typeFilter });
+}
+async function receiveResponce(args) {
+    const req = args.req;
+    const maxSize = MAX_RESPONSE_SIZE;
+    req.on('response', (res) => {
+        // Check html
+        if (args.typeFilter && !res.headers['content-type']?.match(args.typeFilter)) {
+            req.cancel(`Rejected by type filter ${res.headers['content-type']}`);
+            return;
+        }
+        // Size check against the content-length response header
+        const contentLength = res.headers['content-length'];
+        if (contentLength != null) {
+            const size = Number(contentLength);
+            if (size > maxSize) {
+                req.cancel(`maxSize exceeded (${size} > ${maxSize}) on response`);
+            }
+        }
+    });
+    // Size check against the amount of data received so far
+    req.on('downloadProgress', (progress) => {
+        if (progress.transferred > maxSize && progress.percent !== 1) {
+            req.cancel(`maxSize exceeded (${progress.transferred} > ${maxSize}) on response`);
+        }
+    });
+    // Await the response, reshaping HTTP status errors into StatusError
+    const res = await req.catch(e => {
+        if (e instanceof Got.HTTPError) {
+            throw new StatusError(`${e.response.statusCode} ${e.response.statusMessage}`, e.response.statusCode, e.response.statusMessage);
+        }
+        else {
+            throw e;
+        }
+    });
+    return res;
+}
diff --git a/built/utils/null-or-empty.d.ts b/built/utils/null-or-empty.d.ts
new file mode 100644
index 0000000..d19ff5c
--- /dev/null
+++ b/built/utils/null-or-empty.d.ts
@@ -0,0 +1 @@
+export default function (val: string): boolean;
diff --git a/built/utils/null-or-empty.js b/built/utils/null-or-empty.js
new file mode 100644
index 0000000..078e6a3
--- /dev/null
+++ b/built/utils/null-or-empty.js
@@ -0,0 +1,14 @@
+export default function (val) {
+    if (val === undefined) {
+        return true;
+    }
+    else if (val === null) {
+        return true;
+    }
+    else if (val.trim() === '') {
+        return true;
+    }
+    else {
+        return false;
+    }
+}
diff --git a/built/utils/status-error.d.ts b/built/utils/status-error.d.ts
new file mode 100644
index 0000000..fe06108
--- /dev/null
+++ b/built/utils/status-error.d.ts
@@ -0,0 +1,6 @@
+export declare class StatusError extends Error {
+    statusCode: number;
+    statusMessage?: string;
+    isPermanentError: boolean;
+    constructor(message: string, statusCode: number, statusMessage?: string);
+}
diff --git a/built/utils/status-error.js b/built/utils/status-error.js
new file mode 100644
index 0000000..d394e0c
--- /dev/null
+++ b/built/utils/status-error.js
@@ -0,0 +1,9 @@
+export class StatusError extends Error {
+    constructor(message, statusCode, statusMessage) {
+        super(message);
+        this.name = 'StatusError';
+        this.statusCode = statusCode;
+        this.statusMessage = statusMessage;
+        this.isPermanentError = typeof this.statusCode === 'number' && this.statusCode >= 400 && this.statusCode < 500;
+    }
+}
diff --git a/package.json b/package.json
index 53c4125..5a278bc 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "summaly",
-  "version": "3.0.0-alpha.1",
+  "version": "3.0.1",
   "description": "Get web page's summary",
   "author": "syuilo ",
   "license": "MIT",