Skip to content

Commit 5817f58

Browse files
committed
fix: 优化知网抓取代码
1. 增加异常信息日志输出 2. 优化任务执行,控制任务并行,使用串行检索,避免并发被反爬风控
1 parent e601b35 commit 5817f58

File tree

3 files changed

+48
-44
lines changed

3 files changed

+48
-44
lines changed

src/modules/services/cnki.ts

Lines changed: 7 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -270,55 +270,23 @@ async function getSnapshotItem(
270270
return undefined;
271271
}
272272

273-
// Update additional information on the item.
274-
// The CNKI citation count is stored in the Extra field under the key: CNKICite
275-
async function updateItem(
276-
item: Zotero.Item | null,
277-
searchResult: ScrapeSearchResult,
278-
): Promise<Zotero.Item | null> {
279-
if (item) {
280-
if (searchResult.citation) {
281-
ztoolkit.ExtraField.setExtraField(
282-
item,
283-
"CNKICite",
284-
`${searchResult.citation}`,
285-
);
286-
}
287-
288-
if (searchResult.netFirst) {
289-
ztoolkit.ExtraField.setExtraField(
290-
item,
291-
"Status",
292-
"advance online publication",
293-
);
294-
}
295-
296-
// Remove unmatched Zotero fields note.
297-
if (item.getNotes().length > 0) {
298-
item.getNotes().forEach(async (nid) => {
299-
const nItem = Zotero.Items.get(nid);
300-
await nItem.eraseTx();
301-
});
302-
}
303-
304-
if (!item.getField("date") && searchResult.date) {
305-
item.setField("date", searchResult.date);
306-
}
307-
}
308-
return item;
309-
}
310-
311273
export class CNKI implements ScrapeService {
312274
async search(
313275
searchOption: SearchOption,
314276
): Promise<ScrapeSearchResult[] | null> {
315277
ztoolkit.log("serch options: ", searchOption);
316278
const postOption = createSearchPostOptions(searchOption);
317279
let responseText: string;
280+
const cookieBox = await addon.data.myCookieSandbox.getCNKIHomeCookieBox();
281+
ztoolkit.log("Cookies in sandbox: ", cookieBox._cookies);
282+
ztoolkit.log(addon.taskRunner.runningTask);
283+
addon.taskRunner.runningTask?.addMsg(
284+
`CNKI site info: ${Object.keys(cookieBox._cookies).length}`,
285+
);
318286
const resp = await Zotero.HTTP.request("POST", postOption.url, {
319287
headers: postOption.headers,
320288
body: postOption.data,
321-
cookieSandbox: await addon.data.myCookieSandbox.getCNKIHomeCookieBox(),
289+
cookieSandbox: cookieBox,
322290
timeout: 10000,
323291
successCodes: [200, 403],
324292
});

src/modules/services/index.ts

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,11 @@ export async function metaTranslate(task: ScraperTask): Promise<void> {
164164

165165
if (translatedItems.length === 1) {
166166
// if (addon.data.env != "development")
167-
const translatedItem = await globalItemFix(task.item, translatedItems[0]);
167+
const translatedItem = await globalItemFix(
168+
task.item,
169+
translatedItems[0],
170+
task,
171+
);
168172
if (task.type == "attachment") {
169173
task.item.parentID = translatedItem.id;
170174
} else if (task.type == "snapshot") {
@@ -203,9 +207,11 @@ export async function metaTranslate(task: ScraperTask): Promise<void> {
203207
}
204208

205209
// Need to update data in item returned by translator.
210+
// Add some extra data.
206211
async function globalItemFix(
207212
oldItem: Zotero.Item,
208213
newItem: Zotero.Item,
214+
task: ScraperTask,
209215
): Promise<Zotero.Item> {
210216
if (Zotero.Prefs.get("extensions.zotero.automaticTags", true)) {
211217
// Keyword tag type is automatic.
@@ -224,5 +230,36 @@ async function globalItemFix(
224230
// Preserve collections
225231
oldItem.getCollections().forEach((cid) => newItem!.addToCollection(cid));
226232
await newItem.saveTx();
233+
234+
// CNKI extra data fix
235+
const searchResult = task.searchResults[task.resultIndex!];
236+
if (searchResult.citation) {
237+
ztoolkit.ExtraField.setExtraField(
238+
newItem,
239+
"CNKICite",
240+
`${searchResult.citation}`,
241+
);
242+
}
243+
244+
if (searchResult.netFirst) {
245+
ztoolkit.ExtraField.setExtraField(
246+
newItem,
247+
"Status",
248+
"advance online publication",
249+
);
250+
}
251+
252+
// Remove unmatched Zotero fields note.
253+
if (newItem.getNotes().length > 0) {
254+
newItem.getNotes().forEach(async (nid) => {
255+
const nItem = Zotero.Items.get(nid);
256+
await nItem.eraseTx();
257+
});
258+
}
259+
260+
if (!newItem.getField("date") && searchResult.date) {
261+
newItem.setField("date", searchResult.date);
262+
}
263+
227264
return newItem;
228265
}

src/utils/task.ts

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,8 +203,7 @@ export class TaskRunner {
203203
): Promise<string> {
204204
const task = this.createTask(item, type, silent);
205205
task.addMsg(getString("task-msg-header"));
206-
task.addMsg(`Zotero version: ${Zotero.version}`);
207-
task.addMsg(`Addon version: ${version}`);
206+
task.addMsg(`Zotero: ${Zotero.version}, Jasminum: ${version}`);
208207
await this.addTask(task);
209208
return task.id;
210209
}
@@ -215,9 +214,9 @@ export class TaskRunner {
215214
async runTask(task: AttachmentTask | ScraperTask): Promise<void> {
216215
this.runningTask = task;
217216
if (this.getTaskType(task) === "attachmentScraper") {
218-
this.runAttachmentTask(task as AttachmentTask);
217+
await this.runAttachmentTask(task as AttachmentTask);
219218
} else {
220-
this.runScrapeTask(task as ScraperTask);
219+
await this.runScrapeTask(task as ScraperTask);
221220
}
222221
this.runningTask = null;
223222
}

0 commit comments

Comments
 (0)