Skip to content

Commit 85ea5ab

Browse files
committed
feat: 优化抓取服务逻辑并添加 HeadlessBrowser 支持
1. 优化抓取服务执行顺序 - 调整搜索顺序为:万方 → Yiigle → 知网 - 添加相似度计算,精确匹配时提前终止后续服务 - 知网作为最后兜底方案,减少验证码触发 2. 新增 HeadlessBrowser 工具 - 实现基于 JSWindowActor 的无头浏览器服务 - 支持页面加载、元素等待、表单交互等功能 - 添加 JasminumHeadlessChild Actor 处理页面交互 - 清理旧的 headlessBridge.js 实现 3. 新增万方数据抓取服务 - 实现万方数据的搜索和翻译功能 - 使用 HeadlessBrowser 进行页面抓取 4. 优化 Yiigle 抓取服务 - 迁移到 HeadlessBrowser 实现 - 改进页面加载和数据提取逻辑 5. 更新设置界面和本地化 - 添加相关配置选项 - 更新中英文翻译文件
1 parent 3c1c86f commit 85ea5ab

File tree

18 files changed

+1173
-81
lines changed

18 files changed

+1173
-81
lines changed
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
/**
 * Produce a plain, JSON-compatible copy of `value` by round-tripping it
 * through JSON.
 *
 * Anything JSON cannot represent at the top level (undefined, functions,
 * symbols) is reported as `null`. Properties that JSON drops (e.g. undefined
 * members of an object) are dropped from the copy as well.
 *
 * @param {*} value - The value to copy.
 * @returns {*} A JSON-compatible copy of `value`, or `null`.
 * @throws {TypeError} If `value` contains circular references or BigInt
 *   values, because JSON.stringify rejects those.
 */
function serializeValue(value) {
  if (value === undefined) {
    return null;
  }

  // JSON.stringify returns undefined (not a string) for functions and
  // symbols; passing that on to JSON.parse would throw a SyntaxError.
  const json = JSON.stringify(value);
  if (json === undefined) {
    return null;
  }

  return JSON.parse(json);
}
7+
8+
/**
 * Reduce a thrown value to a plain `{ name, message, stack }` record.
 *
 * Missing or empty fields fall back to "Error" for the name, a string
 * description of the thrown value for the message, and "" for the stack.
 *
 * @param {*} error - Whatever was thrown; it does not have to be an Error.
 * @returns {{name: *, message: *, stack: *}} The extracted fields.
 */
function serializeError(error) {
  // String() throws for values that cannot be converted to a primitive
  // (e.g. Object.create(null) or an object with a throwing toString).
  // This function runs inside a catch handler, so it must not throw itself.
  const describe = () => {
    try {
      return String(error);
    }
    catch {
      return Object.prototype.toString.call(error);
    }
  };

  return {
    name: error?.name || "Error",
    message: error?.message || describe(),
    stack: error?.stack || "",
  };
}
15+
16+
/**
 * Look up the `value` setter defined on the prototype of the element's
 * form-control class.
 *
 * Text areas are checked before inputs. For any other element the result is
 * `null`; if the matching prototype has no own `value` descriptor the result
 * is `undefined`.
 *
 * @param {*} win - Object exposing the HTMLTextAreaElement and
 *   HTMLInputElement constructors (the element's window).
 * @param {*} element - The element whose setter is wanted.
 * @returns {Function|null|undefined} The prototype-level `value` setter.
 */
function getValueSetter(win, element) {
  for (const ctorName of ["HTMLTextAreaElement", "HTMLInputElement"]) {
    const ctor = win[ctorName];
    if (element instanceof ctor) {
      const descriptor = Object.getOwnPropertyDescriptor(ctor.prototype, "value");
      return descriptor?.set;
    }
  }
  return null;
}
31+
32+
/**
 * Return a promise that settles once `document` has loaded far enough.
 *
 * - No document, or readyState "complete": resolves immediately.
 * - readyState "interactive": resolves after `allowInteractiveAfter`
 *   milliseconds, unless `allowInteractiveAfter` is `false`, in which case
 *   only "complete" is accepted.
 * - Otherwise: listens for readystatechange / DOMContentLoaded (and `load` on
 *   the document's window) and re-checks the state on every event.
 *
 * The promise never rejects, and it has no timeout of its own: if the
 * document never reaches an accepted state it stays pending.
 *
 * @param {*} document - Document to watch; a falsy value resolves at once.
 * @param {{allowInteractiveAfter?: number|false}} [options]
 * @returns {Promise<void>}
 */
function waitForDocumentReady(document, { allowInteractiveAfter = 0 } = {}) {
  if (!document) {
    return Promise.resolve();
  }

  const acceptInteractive = allowInteractiveAfter !== false;

  // Schedule `resolve` after the configured delay. The global setTimeout is
  // used when it exists. NOTE(review): Firefox system modules normally obtain
  // timers from Timer.sys.mjs rather than a global, so a bare setTimeout may
  // be undefined where this file is loaded -- confirm. In that case fall back
  // to the document window's timer, and as a last resort resolve right away.
  const resolveLater = (resolve) => {
    const delay = allowInteractiveAfter || 0;
    if (typeof setTimeout === "function") {
      setTimeout(resolve, delay);
      return;
    }
    const view = document.defaultView;
    if (typeof view?.setTimeout === "function") {
      view.setTimeout(resolve, delay);
      return;
    }
    resolve();
  };

  const readyState = document.readyState;
  if (readyState === "complete") {
    return Promise.resolve();
  }
  if (readyState === "interactive" && acceptInteractive) {
    return new Promise((resolve) => {
      resolveLater(resolve);
    });
  }

  return new Promise((resolve) => {
    const onReadyStateChange = () => {
      const state = document.readyState;
      if (state === "complete") {
        cleanup();
        resolve();
        return;
      }
      if (state === "interactive" && acceptInteractive) {
        cleanup();
        resolveLater(resolve);
      }
    };

    // Detach every listener registered below so the handler cannot run again
    // after the promise has been settled (or scheduled to settle).
    const cleanup = () => {
      document.removeEventListener("readystatechange", onReadyStateChange, true);
      document.removeEventListener("DOMContentLoaded", onReadyStateChange, true);
      document.defaultView?.removeEventListener("load", onReadyStateChange, true);
    };

    document.addEventListener("readystatechange", onReadyStateChange, true);
    document.addEventListener("DOMContentLoaded", onReadyStateChange, true);
    document.defaultView?.addEventListener("load", onReadyStateChange, true);
    // Re-check immediately in case the state changed before the listeners
    // were attached.
    onReadyStateChange();
  });
}
73+
74+
/**
 * Child-side window actor that performs page interactions on request.
 *
 * Supported message names: "evaluate", "click", "fill" and "press". Each
 * handler receives `message.data` (or `{}` when absent) as its payload.
 * "evaluate" reports failures as `{ ok: false, error }`; the other handlers
 * throw, so their failures surface as a rejected `receiveMessage` promise.
 */
export class JasminumHeadlessChild extends JSWindowActorChild {
  /**
   * @returns {*} The actor's content window.
   * @throws {Error} If the content window is not available.
   */
  _getWindow() {
    if (!this.contentWindow) {
      throw new Error("Headless actor contentWindow is not available");
    }
    return this.contentWindow;
  }

  /**
   * @returns {*} The actor's document.
   * @throws {Error} If the document is not available.
   */
  _getDocument() {
    const doc = this.document;
    if (!doc) {
      throw new Error("Headless actor document is not available");
    }
    return doc;
  }

  /**
   * Find the first element matching `selector` in the actor's document.
   *
   * @param {string} selector - CSS selector passed to querySelector.
   * @returns {*} The matching element.
   * @throws {Error} If nothing matches.
   */
  _getElement(selector) {
    const element = this._getDocument().querySelector(selector);
    if (!element) {
      throw new Error(`Element not found for selector: ${selector}`);
    }
    return element;
  }

  /**
   * Dispatch an incoming actor message to the matching handler.
   *
   * When a document is available, the handler only runs after the document
   * is at least "interactive" (see waitForDocumentReady).
   *
   * @param {{name: string, data?: object}} message
   * @returns {Promise<object>} The handler's result.
   * @throws {Error} If the message name is not recognised.
   */
  async receiveMessage(message) {
    const doc = this.document;
    if (doc) {
      await waitForDocumentReady(doc, { allowInteractiveAfter: 0 });
    }

    switch (message.name) {
      case "evaluate":
        return this._evaluate(message.data || {});

      case "click":
        return this._click(message.data || {});

      case "fill":
        return this._fill(message.data || {});

      case "press":
        return this._press(message.data || {});
    }

    throw new Error(`Unknown headless actor message: ${message.name}`);
  }

  /**
   * Evaluate a function given as source text and return its result.
   *
   * The function is called as `fn(window, document, ...payload.args)` and may
   * be async. Its result is passed through serializeValue.
   *
   * @param {{source: string, args?: Array}} payload
   * @returns {Promise<{ok: true, value: *}|{ok: false, error: object}>}
   *   Never rejects; failures are reported in the `error` field.
   */
  async _evaluate(payload) {
    try {
      // NOTE(review): eval executes arbitrary source text with this actor's
      // privileges. Presumably only the add-on's own parent-side code sends
      // "evaluate" messages -- confirm that `payload.source` can never carry
      // page-controlled or user-controlled text.
      const fn = eval(`(${payload.source})`);
      if (typeof fn !== "function") {
        throw new Error("evaluate() expects a function source");
      }
      const value = await fn(
        this._getWindow(),
        this._getDocument(),
        ...(payload.args || []),
      );
      return {
        ok: true,
        value: serializeValue(value),
      };
    }
    catch (error) {
      return {
        ok: false,
        error: serializeError(error),
      };
    }
  }

  /**
   * Scroll the selected element into view and click it.
   *
   * @param {{selector: string}} payload
   * @returns {{url: string}} The window location after the click.
   * @throws {Error} If the element cannot be found.
   */
  _click(payload) {
    const win = this._getWindow();
    const element = this._getElement(payload.selector);
    element.scrollIntoView?.({ block: "center", inline: "center" });
    element.click();
    return {
      url: win.location.href,
    };
  }

  /**
   * Set the value of an input, textarea or select and fire the matching
   * "input" (and, unless disabled, "change") events.
   *
   * @param {{selector: string, value?: *, clear?: boolean,
   *   dispatchChange?: boolean, blur?: boolean}} payload
   * @returns {{value: string, url: string}} The element's resulting value and
   *   the window location.
   * @throws {Error} If the element is missing or is not a form control.
   */
  _fill(payload) {
    const win = this._getWindow();
    const element = this._getElement(payload.selector);

    const isSelect = element instanceof win.HTMLSelectElement;
    const isTextControl
      = element instanceof win.HTMLInputElement
        || element instanceof win.HTMLTextAreaElement;
    if (!isTextControl && !isSelect) {
      throw new Error(`Element is not a form control: ${payload.selector}`);
    }

    element.focus?.();

    if (isSelect) {
      element.value = payload.value;
    }
    else {
      // Write through the prototype-level setter rather than the instance
      // property. NOTE(review): presumably this is so pages that override
      // `value` on the element still see the change -- confirm.
      const setter = getValueSetter(win, element);
      const nextValue = payload.value == null ? "" : String(payload.value);
      // NOTE(review): this empty write is overwritten right below, so the
      // `clear` option currently has no observable effect on the final value.
      if (payload.clear !== false && setter) {
        setter.call(element, "");
      }
      if (setter) {
        setter.call(element, nextValue);
      }
      else {
        element.value = nextValue;
      }
    }

    element.dispatchEvent(
      new win.Event("input", { bubbles: true, cancelable: true }),
    );
    if (payload.dispatchChange !== false) {
      element.dispatchEvent(
        new win.Event("change", { bubbles: true, cancelable: true }),
      );
    }
    if (payload.blur) {
      element.blur?.();
    }

    return {
      value: element.value,
      url: win.location.href,
    };
  }

  /**
   * Simulate a key press (keydown, keypress, keyup) on an element.
   *
   * The target is the element matching `payload.selector` when given,
   * otherwise the active element, otherwise the body. For Enter on an input,
   * the owning form is submitted as well, unless a page listener cancelled
   * the keydown or keypress event.
   *
   * @param {{selector?: string, key: string, code?: string,
   *   keyCode?: number}} payload
   * @returns {{url: string}} The window location after the key press.
   * @throws {Error} If a selector is given and nothing matches.
   */
  _press(payload) {
    const win = this._getWindow();
    const doc = this._getDocument();
    const target = payload.selector
      ? this._getElement(payload.selector)
      : doc.activeElement || doc.body;

    target?.focus?.();

    const legacyCode = payload.keyCode || (payload.key === "Enter" ? 13 : 0);
    const eventInit = {
      key: payload.key,
      code: payload.code || payload.key,
      keyCode: legacyCode,
      which: legacyCode,
      bubbles: true,
      cancelable: true,
    };

    // dispatchEvent returns false when a listener called preventDefault().
    // A cancelled keydown/keypress means the page handled the key itself, so
    // the form must not be submitted a second time below.
    let defaultPrevented = false;
    for (const type of ["keydown", "keypress", "keyup"]) {
      const notCancelled = target?.dispatchEvent(
        new win.KeyboardEvent(type, eventInit),
      );
      if (notCancelled === false && type !== "keyup") {
        defaultPrevented = true;
      }
    }

    // Synthetic key events do not trigger the browser's implicit form
    // submission, so request it explicitly.
    if (
      !defaultPrevented
      && payload.key === "Enter"
      && target instanceof win.HTMLInputElement
    ) {
      target.form?.requestSubmit?.();
    }

    return {
      url: win.location.href,
    };
  }
}

addon/chrome/content/preferences-main.xhtml

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
data-l10n-id="namepattern-desc"
5555
></image>
5656
</hbox>
57-
<!-- <hbox align="center">
57+
<hbox align="center">
5858
<label
5959
for="zotero-prefpane-__addonRef__-metadata-source"
6060
data-l10n-id="label-metadata-source"
@@ -81,12 +81,23 @@
8181
<checkbox
8282
class="metadata-drop-item"
8383
native="true"
84-
value="CVIP"
85-
data-l10n-id="label-metadata-source-cvip"
84+
value="WanFangData"
85+
data-l10n-id="label-metadata-source-wanfangdata"
86+
/>
87+
<checkbox
88+
class="metadata-drop-item"
89+
native="true"
90+
value="Yiigle"
91+
data-l10n-id="label-metadata-source-yiigle"
8692
/>
8793
</html:div>
8894
</html:div>
89-
</hbox> -->
95+
<image
96+
class="help-icon"
97+
src="chrome://jasminum/content/icons/help.svg"
98+
data-l10n-id="metadata-source-desc"
99+
></image>
100+
</hbox>
90101
</groupbox>
91102
<!-- 转换器 -->
92103
<groupbox>

addon/locale/en-US/addon.ftl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,6 @@ bookmark-delete = Delete bookmark
7474
# Progress window
7575
task-msg-header =
7676
Please screenshot the following content and contact the developer: [RedBook l0o0](https://www.xiaohongshu.com/user/profile/6153b4fa000000001f03ac8c)
77-
If you don't get a timely response, you can get free remote assistance on Taobao. Please look for the official store: [Contact via WangWang](https://item.taobao.com/item.htm?ft=t&id=1035769863393)
77+
If you don't get a timely response, you can get free consultation on Taobao. Please look for the official store: [Contact via WangWang](https://item.taobao.com/item.htm?ft=t&id=1035769863393)
7878
You can also open the QR dialog directly here: [Show QR Code](jasminum://remote-help-qr)
7979
task-already-exists = Task already exists: { $title }

addon/locale/en-US/preferences-main.ftl

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,13 @@ label-choose-source =
2828
.label = Select Data Source
2929
label-metadata-source-cnki =
3030
.label = CNKI (China National Knowledge Infrastructure)
31-
label-metadata-source-cvip =
32-
.label = VIP Journals (Chinese VIP Information)
31+
label-metadata-source-wanfangdata =
32+
.label = Wanfang Data
33+
label-metadata-source-yiigle =
34+
.label = Yiigle (Chinese Medical Journals)
35+
metadata-source-desc =
36+
.tooltiptext = Select the source for metadata retrieval. Generally, there is no need to switch. If you find some data sources unnecessary, you can uncheck them.
37+
3338
label-pdf-match-folder = Attachment Matching Folder
3439
label-choose-folder =
3540
.label = Select Folder

addon/locale/zh-CN/addon.ftl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,5 +75,5 @@ bookmark-delete = 删除书签
7575
# Progress window
7676
task-msg-header =
7777
请截图该弹窗并联系开发者:[小红书l0o0](https://www.xiaohongshu.com/user/profile/6153b4fa000000001f03ac8c)
78-
如回复未及时,也可直接点击这里免费呼唤远程协助:[查看二维码](jasminum://remote-help-qr)
78+
如回复未及时,也可直接点击这里免费咨询:[查看二维码](jasminum://remote-help-qr)
7979
task-already-exists = 任务已存在:{ $title }

addon/locale/zh-CN/preferences-main.ftl

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,13 @@ label-choose-source =
2828
.label = 选择数据源
2929
label-metadata-source-cnki =
3030
.label = 中国知网CNKI
31-
label-metadata-source-cvip =
32-
.label = 维普期刊CVIP
31+
label-metadata-source-wanfangdata =
32+
.label = 万方数据WanfangData
33+
label-metadata-source-yiigle =
34+
.label = 中华医学期刊Yiigle
35+
metadata-source-desc =
36+
.tooltiptext = 选择元数据抓取来源,一般情况下不用切换。如果你觉得有些数据源不需要,可以取消勾选。
37+
3338
3439
namepattern-desc =
3540
.tooltiptext = 根据文件名抓取知网元数据,文件名格式设置:

addon/locale/zh-TW/addon.ftl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,6 @@ bookmark-delete = 刪除書籤
7575
# Progress window
7676
task-msg-header =
7777
如果抓取異常需要幫助,請截圖以下內容並聯繫開發者:[小紅書l0o0](https://www.xiaohongshu.com/user/profile/6153b4fa000000001f03ac8c)
78-
如回复未及时,可在淘宝免费远程协助,请认准官方店铺:[点击旺旺联系](https://item.taobao.com/item.htm?ft=t&id=1035769863393)
78+
如回覆未及時,可在淘寶免費諮詢,請認準官方店鋪:[點擊旺旺聯繫](https://item.taobao.com/item.htm?ft=t&id=1035769863393)
7979
也可直接點擊這裡打開二維碼:[查看二維碼](jasminum://remote-help-qr)
8080
task-already-exists = 已存在任務:{ $title }

addon/locale/zh-TW/preferences-main.ftl

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,13 @@ label-choose-source =
2828
.label = 選擇資料來源
2929
label-metadata-source-cnki =
3030
.label = 中國知網CNKI
31-
label-metadata-source-cvip =
32-
.label = 維普期刊CVIP
31+
label-metadata-source-wanfangdata =
32+
.label = 萬方數據WanfangData
33+
label-metadata-source-yiigle =
34+
.label = 中華醫學期刊Yiigle
35+
metadata-source-desc =
36+
.tooltiptext = 選擇元數據抓取來源,一般情況下不用切換。如果你覺得有些資料來源不需要,可以取消勾選。
37+
3338
label-pdf-match-folder = 附件匹配資料夾
3439
label-choose-folder =
3540
.label = 選擇資料夾

addon/prefs.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ pref("language", "zh");
99
pref("autoUpdateMetadata", true);
1010
pref("namePattern", "{%t}_{%g}");
1111
pref("namePatternCustom", "{%t}");
12-
pref("metadataSource", "CNKI");
12+
pref("metadataSource", "CNKI, WanFangData, Yiigle");
1313
pref("isMainlandChina", true);
1414
pref("cnkiAttachmentCookie", "");
1515
pref("similarityThresholdForMetaData", "0.6");

src/addon.ts

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,20 @@ import { getOutlineFromPDF } from "./modules/outline/outline";
77
import { TaskRunner } from "./utils/task";
88
import { requestDocument } from "./utils/http";
99
import { openRemoteHelpDialog } from "./modules/preferences/remoteHelp";
10+
import {
11+
HeadlessBrowserOptions,
12+
HeadlessBrowserService,
13+
} from "./utils/headlessBrowser";
14+
15+
/**
 * Shape of the `api` object exposed on the add-on instance.
 */
export type AddonAPI = {
  /** The `getOutlineFromPDF` function from the outline module. */
  getOutlineFromPDF: typeof getOutlineFromPDF;
  /** The `requestDocument` function from the HTTP utilities. */
  requestDocument: typeof requestDocument;
  /** The `openRemoteHelpDialog` function from the preferences module. */
  openRemoteHelpDialog: typeof openRemoteHelpDialog;
  /** The `HeadlessBrowserService` class itself (not an instance). */
  HeadlessBrowserService: typeof HeadlessBrowserService;
  /** Factory returning a `HeadlessBrowserService` for the given options. */
  createHeadlessBrowser: (options?: HeadlessBrowserOptions) => HeadlessBrowserService;
};
1024

1125
class Addon {
1226
public data: {
@@ -36,7 +50,7 @@ class Addon {
3650
// Lifecycle hooks
3751
public hooks: typeof hooks;
3852
// APIs
39-
public api: object;
53+
public api: AddonAPI;
4054
public taskRunner: TaskRunner;
4155

4256
constructor() {
@@ -55,7 +69,14 @@ class Addon {
5569
isImportingAttachments: false,
5670
};
5771
this.hooks = hooks;
58-
this.api = { getOutlineFromPDF, requestDocument, openRemoteHelpDialog };
72+
this.api = {
73+
getOutlineFromPDF,
74+
requestDocument,
75+
openRemoteHelpDialog,
76+
HeadlessBrowserService,
77+
createHeadlessBrowser: (options?: HeadlessBrowserOptions) =>
78+
new HeadlessBrowserService(options),
79+
};
5980
this.taskRunner = new TaskRunner();
6081
}
6182
}

0 commit comments

Comments
 (0)