Skip to content

Commit 940b9fe

Browse files
committed
add beginnings of new scheduler model for recurring imports
1 parent c2f2f4f commit 940b9fe

File tree

1 file changed

+99
-0
lines changed

1 file changed

+99
-0
lines changed

archivebox/scheduler/models.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
class ScheduledTask(models.Model):
    """Abstract base model with the bookkeeping fields shared by scheduled tasks.

    Concrete task types subclass this and add their own parameters. Because
    ``Meta.abstract`` is set, no database table is created for this class
    itself; its fields are copied onto each concrete subclass.

    NOTE: subclasses must not define an attribute or method with the same name
    as one of these fields (e.g. ``schedule``), otherwise Django will not
    inherit that field onto the subclass.
    """

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)

    # Model fields have no ``required=`` option (that is a form-field kwarg);
    # a CharField is already required in forms because ``blank`` defaults to False.
    name = models.CharField(max_length=128, unique=True)
    # Default is 'week' so that it is one of the keywords accepted by the
    # schedule validation in ScheduledAdd.save() ('hour', 'day', 'week',
    # 'month', 'year', or a cron expression).
    schedule = models.CharField(max_length=32, default='week')
    enabled = models.BooleanField(default=True)

    # Set automatically by Django on insert / on every save respectively.
    added = models.DateTimeField(auto_now_add=True)
    updated = models.DateTimeField(auto_now=True)

    # Model fields have no ``min_value=`` option; PositiveIntegerField is the
    # built-in way to express a counter that can never be negative.
    runs = models.PositiveIntegerField(default=0, editable=False)
    # blank=True so the empty-string default passes model validation.
    last_output = models.CharField(max_length=1024, default='', blank=True)
    # Both timestamps stay NULL until the task has run at least once.
    last_start_ts = models.DateTimeField(default=None, null=True, blank=True)
    last_end_ts = models.DateTimeField(default=None, null=True, blank=True)

    class Meta:
        abstract = True
18+
19+
20+
class ScheduledAdd(ScheduledTask):
    """A recurring import ("add") of a fixed set of URLs.

    Stores the URLs and the flags to pass to ``add()``, writes the URLs to a
    source file on save, and registers the task with the scheduler.

    The scheduling method is named ``apply_schedule()`` (not ``schedule()``):
    a method called ``schedule`` would shadow the ``schedule`` field inherited
    from ``ScheduledTask`` and prevent Django from adding that field to this
    model.

    NOTE(review): ``urls_source_file_path`` is read by ``apply_schedule()``
    and ``run()`` but is not defined anywhere in this file as shown. It
    presumably should resolve to the file written by ``save_text_as_source()``
    -- TODO define it before relying on either method.
    """

    # main task parameters
    # blank=True because an empty URL list is valid when update_all is set.
    urls = models.TextField(max_length=4096, default='', blank=True, db_index=True)
    tag = models.ManyToManyField(Tag)

    # add behavior flags
    # Model fields take no min_value/max_value kwargs; the 0..1 range is
    # enforced in save() instead.
    depth = models.PositiveSmallIntegerField(default=0)
    resnapshot = models.BooleanField(default=False)
    overwrite = models.BooleanField(default=False)
    index_only = models.BooleanField(default=False)
    update_all = models.BooleanField(default=False)

    # django.db.models has no CSVField; the list is stored as a plain
    # comma-separated string (presumably the form add() expects -- verify).
    extractors = models.CharField(max_length=256, default='', blank=True)
    parser = models.CharField(max_length=32, default='auto', choices=PARSER_CHOICES)

    @cached_property
    def source_filename(self) -> str:
        """Return the file name used for this task's saved URL list.

        Uses the first 8 characters of the task's UUID as a short identifier,
        since no ``short_id`` attribute is defined on this model.
        """
        return f'{str(self.id)[:8]}-scheduled-import.txt'

    def save(self, *args, **kwargs):
        """Validate the task, persist it, write its URL file and schedule it.

        Raises:
            AssertionError: if the task would do nothing, has an invalid
                schedule or depth, or sets both ``overwrite`` and
                ``resnapshot``. Raised explicitly (not via ``assert``) so the
                checks still run under ``python -O``.
        """
        self.urls = (self.urls or '').strip()

        if not (self.urls or self.update_all):
            raise AssertionError(
                'you must either pass some urls to import, or set the task to update'
                ' all existing URLS, otherwise it will do nothing')

        if not (self.schedule in ('hour', 'day', 'week', 'month', 'year')
                or isValidCronSchedule(self.schedule)):
            raise AssertionError(f'invalid schedule: {self.schedule!r}')

        if self.depth not in (0, 1):
            raise AssertionError(f'depth must be 0 or 1, got: {self.depth!r}')

        if self.overwrite and self.resnapshot:
            raise AssertionError(
                'When snapshotting a URL thats previously snapshotted, '
                'you may either overwrite it, or re-snapshot it, but not both')

        # some more validation here...

        # Persist first, so a database error (e.g. a duplicate name) does not
        # leave behind a source file or scheduler entry for an unsaved task.
        super().save(*args, **kwargs)

        save_text_as_source(self.urls, filename=self.source_filename)
        self.apply_schedule()

    def apply_schedule(self):
        """Register this task with the configured scheduling backend."""
        method = 'system crontab' if USE_SYSTEM_CRON else 'archivebox scheduler'
        print(f'[*] Scheduling import {self.name} to run every {self.schedule} using {method}')

        # TODO: decide whether to support system cron at all, or enforce python scheduler
        if USE_SYSTEM_CRON:
            # ``schedule`` here is the module-level function, not a method.
            schedule(
                every=self.schedule,
                depth=self.depth,
                overwrite=self.overwrite,
                import_path=self.urls_source_file_path,
            )
        else:
            # TODO: update yacron/celery/huey/APScheduler etc. whatever scheduler we choose
            pass

    def run(self, force: bool=False):
        """Run the import once and record the outcome on the task.

        Args:
            force: run even if the task is disabled.

        Returns:
            None. The result summary is stored in ``last_output``.
        """
        if (not self.enabled) and (not force):
            print(f'[!] Refusing to run scheduled import that is disabled: {self.name}')
            return None

        # TODO: enforce "at most once" or "at least once" concurrency somehow

        print(f'[+] [{timezone.now().isoformat()}] Running scheduled import {self.name}...\n')

        self.last_start_ts = timezone.now()
        self.runs += 1
        try:
            all_links, new_links = add(
                urls=Path(self.urls_source_file_path),
                # NOTE(review): this passes the many-to-many manager itself;
                # confirm what type add() expects for ``tag``.
                tag=self.tag,
                depth=self.depth,
                update_all=self.update_all,
                index_only=self.index_only,
                overwrite=self.overwrite,
                extractors=self.extractors,
                parser=self.parser,
            )
            output = f'SUCCEEDED: {len(new_links)} new snapshots ({len(all_links)} total snapshots)'
        except (Exception, SystemExit) as err:
            # SystemExit is still recorded as a failure (presumably add() may
            # exit on error -- verify), but KeyboardInterrupt is no longer
            # swallowed, so Ctrl-C can stop the process.
            output = f'FAILED: {err.__class__.__name__} {err}'

        # Truncate so a long error message cannot exceed the column length.
        self.last_output = output[:self._meta.get_field('last_output').max_length]
        self.last_end_ts = timezone.now()
        self.save()

0 commit comments

Comments
 (0)