Skip to content

Commit

Permalink
Merge pull request #1 from lobehub/feat/new
Browse files Browse the repository at this point in the history
✨ feat: refactor with new version
  • Loading branch information
arvinxx authored Aug 17, 2023
2 parents 8762b3d + c3c798d commit 2f055ce
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 36 deletions.
19 changes: 0 additions & 19 deletions api/parse.ts

This file was deleted.

42 changes: 42 additions & 0 deletions api/v1/_utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { NodeHtmlMarkdown } from 'node-html-markdown';

// Browserless connection settings, read once from the environment at module load.
const { BROWSERLESS_TOKEN, BROWSERLESS_URL } = process.env;

// Use the configured Browserless endpoint, or the default one when BROWSERLESS_URL is unset.
const BASE_URL = BROWSERLESS_URL ?? 'https://chrome.browserless.io';

/**
 * Extract the readable article from an HTML document and convert its body to Markdown.
 *
 * @param html - Raw HTML source of the page.
 * @param url - Address of the page; passed to JSDOM as its `url` option.
 * @returns The fields produced by Readability's `parse()` with `content`
 * replaced by its Markdown translation. When `parse()` yields nothing, the
 * result only carries `content`, translated from an empty string.
 */
export const htmlToMarkdown = (html: string, url: string) => {
  const { window } = new JSDOM(html, { url });
  const parsed = new Readability(window.document).parse();

  // Readability emits HTML; translate it (or an empty string) to Markdown.
  const markdown = NodeHtmlMarkdown.translate(parsed?.content || '', {});

  return { ...parsed, content: markdown };
};

/**
 * Fetch a page through the Browserless `/content` endpoint and convert the
 * rendered HTML to Markdown.
 *
 * Never rejects for request/parsing failures: any error raised while fetching
 * or converting is logged and reported through the returned object instead.
 *
 * @param params - Crawl parameters.
 * @param params.url - Address of the page to crawl.
 * @returns On success `{ content, title, url, website }`, where `content` is
 * Markdown. On failure `{ content: '抓取失败', errorMessage, url }`.
 */
const runner = async ({ url }: { url: string }) => {
  // Request payload forwarded to Browserless as-is.
  const input = {
    gotoOptions: { waitUntil: 'networkidle2' },
    url,
  };

  try {
    // Only attach the token when one is configured, and encode it so special
    // characters cannot corrupt the query string.
    const query = BROWSERLESS_TOKEN ? `?token=${encodeURIComponent(BROWSERLESS_TOKEN)}` : '';

    const res = await fetch(`${BASE_URL}/content${query}`, {
      body: JSON.stringify(input),
      headers: {
        'Content-Type': 'application/json',
      },
      method: 'POST',
    });

    // A non-2xx answer carries an error payload, not the target page; parsing
    // it as an article would report a bogus success.
    if (!res.ok) {
      throw new Error(`Browserless request failed: ${res.status} ${res.statusText}`);
    }

    const html = await res.text();
    const article = htmlToMarkdown(html, url);

    return { content: article.content, title: article.title, url, website: article.siteName };
  } catch (error) {
    console.error(error);

    // Anything can be thrown in JavaScript, so narrow before reading `.message`.
    const errorMessage = error instanceof Error ? error.message : String(error);

    return { content: '抓取失败', errorMessage, url };
  }
};

export default runner;
13 changes: 13 additions & 0 deletions api/v1/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import type { VercelRequest, VercelResponse } from '@vercel/node';

import fetchContent from './_utils';

/**
 * Vercel serverless entry point: crawl the page named in the POST body and
 * reply with the crawl result.
 *
 * Responds with 405 for any method other than POST and with 400 when the body
 * does not carry a non-empty string `url`. In both cases no crawl is started.
 *
 * @param req - Incoming request; `req.body.url` names the page to crawl.
 * @param res - Response used to send the crawl result or the error status.
 */
export default async function handler(req: VercelRequest, res: VercelResponse): Promise<void> {
  if (req.method !== 'POST') {
    // Stop here: without the return the crawl would still run and the
    // response would be sent with the result attached.
    res.setHeader('Allow', 'POST');
    res.status(405).send('Method Not Allowed');
    return;
  }

  // Reject malformed bodies up front; destructuring an undefined body inside
  // fetchContent would otherwise throw outside its own error handling.
  const url: unknown = req.body?.url;
  if (typeof url !== 'string' || url.length === 0) {
    res.status(400).send('Bad Request: expected a JSON body with a string "url" field');
    return;
  }

  const result = await fetchContent(req.body);

  res.send(result);
}
35 changes: 35 additions & 0 deletions api/v1/type.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/**
 * Payload describing a crawled page.
 *
 * NOTE(review): the field names match the success object built by the runner
 * in `_utils.ts` (`content`, `title`, `url`, `website`); presumably this is
 * meant to type that return value — confirm, since no usage is visible here.
 */
export type Result = {
/** page content; presumably Markdown produced by `htmlToMarkdown` — confirm */
content: string;
/** page title, when one is available */
title?: string;
/** address of the page */
url: string;
/** name of the site, when one is available */
website?: string;
};

/**
 * Shape of a parsed article.
 *
 * NOTE(review): presumably mirrors the object returned by Readability's
 * `parse()` as used in `_utils.ts` — confirm against the library's typings,
 * in particular whether any of these fields can be null.
 */
export interface ParserResponse {
/** author metadata */
byline: string;

/** HTML string of processed article content */
content: string;

/** content direction */
dir: string;

/** article description, or short excerpt from the content */
excerpt: string;

/** content language */
lang: string;

/** length of an article, in characters */
length: number;

/** name of the site */
siteName: string;

/** text content of the article, with all the HTML tags removed */
textContent: string;

/** article title */
title: string;
}
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "lobehub-html-parser",
"name": "@lobehub/chat-plugin-web-crawler",
"version": "1.0.1",
"private": true,
"description": "HTML 转 markdown 服务",
"repository": "https://github.com/arvinxx/vercel-serverless-api-template.git",
"description": "Lobe Chat 网页抓取服务",
"repository": "https://github.com/lobehub/chat-plugin-web-crawler.git",
"scripts": {
"ci": "npm run lint && npm run type-check",
"lint": "npm run lint:js && npm run lint:prettier",
Expand Down
17 changes: 3 additions & 14 deletions tests/parse.test.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,14 @@
import { VercelRequest, VercelResponse } from '@vercel/node';
import { readFileSync } from 'node:fs';
import * as path from 'node:path';

import { expect } from 'vitest';
import Api from '../api/parse';
import { htmlToMarkdown } from '../api/v1/_utils';

describe('html-to-markdown', () => {
it('Zhihu', async () => {
it('Zhihu', () => {
const html = readFileSync(path.join(__dirname, './html/zhihu.html'), { encoding: 'utf8' });

const data = await Api(
<VercelRequest>(<unknown>{
body: {
html,
url: 'https://zhuanlan.zhihu.com/p/641434725',
},
}),
<VercelResponse>(<unknown>{
send: () => {},
}),
);
const data = htmlToMarkdown(html, 'https://zhuanlan.zhihu.com/p/641434725');

expect(data).toMatchSnapshot();
});
Expand Down

0 comments on commit 2f055ce

Please sign in to comment.