|
class ScheduledTask(models.Model):
    """Abstract base model with the bookkeeping shared by scheduled tasks.

    Holds the task's identity (``id``, ``name``), when it should run
    (``schedule``, ``enabled``), creation/update timestamps, and the
    outcome of the most recent run (``runs``, ``last_output``,
    ``last_start_ts``, ``last_end_ts``). It is abstract, so it creates no
    table of its own; concrete task types subclass it.
    """

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)

    # Model fields are mandatory unless blank=True is passed; `required` is a
    # form-field argument and is rejected by model fields.
    name = models.CharField(max_length=128, unique=True)
    # 'week' is one of the keywords the subclass validation in this file
    # accepts ('hour', 'day', 'week', 'month', 'year'); the previous default
    # 'weekly' is not in that list.
    schedule = models.CharField(max_length=32, default='week')
    enabled = models.BooleanField(default=True)

    added = models.DateTimeField(auto_now_add=True)
    updated = models.DateTimeField(auto_now=True)

    # Run counter; PositiveIntegerField enforces the >= 0 constraint that the
    # unsupported `min_value` keyword was meant to express.
    runs = models.PositiveIntegerField(default=0, editable=False)
    # Empty until the first run, hence blank=True alongside default=''.
    last_output = models.CharField(max_length=1024, default='', blank=True)
    last_start_ts = models.DateTimeField(default=None, null=True, blank=True)
    last_end_ts = models.DateTimeField(default=None, null=True, blank=True)

    class Meta:
        abstract = True
| 18 | + |
| 19 | + |
class ScheduledAdd(ScheduledTask):
    """Scheduled task that periodically imports a list of URLs.

    The URLs are written to a sources file when the task is saved; each
    run passes that file to ``add()`` together with the behavior flags
    stored on the task.
    """

    # Schedule keywords accepted in addition to cron expressions.
    SCHEDULE_KEYWORDS = ('hour', 'day', 'week', 'month', 'year')

    # main task parameters
    # blank=True: a task with update_all set may legitimately have no URLs.
    urls = models.TextField(max_length=4096, default='', blank=True, db_index=True)
    tag = models.ManyToManyField(Tag)

    # add behavior flags
    # Model fields take no min_value/max_value keywords; the 0..1 range is
    # checked in save() instead.
    depth = models.PositiveSmallIntegerField(default=0)
    resnapshot = models.BooleanField(default=False)
    overwrite = models.BooleanField(default=False)
    index_only = models.BooleanField(default=False)
    update_all = models.BooleanField(default=False)

    # Comma-separated extractor names, stored as plain text (Django ships no
    # CSVField); empty is allowed.
    extractors = models.CharField(max_length=256, default='', blank=True)
    parser = models.CharField(max_length=32, default='auto', choices=PARSER_CHOICES)

    @property
    def short_id(self) -> str:
        """Return the first 8 characters of the task's UUID."""
        return str(self.id)[:8]

    @cached_property
    def source_filename(self) -> str:
        """Return the file name used for this task's URL sources file."""
        return f'{self.short_id}-scheduled-import.txt'

    @property
    def urls_source_file_path(self) -> str:
        """Return the path of this task's sources file.

        The file is written on first access if this instance has not
        written it yet (e.g. an instance freshly loaded from the database).
        """
        known_path = getattr(self, '_urls_source_file_path', None)
        if known_path is not None:
            return known_path
        return self._write_urls_source()

    def _write_urls_source(self) -> str:
        """Write ``urls`` to the sources file and remember where it went."""
        # NOTE(review): assumes save_text_as_source() returns the path of the
        # file it wrote -- confirm against the helper's definition.
        written_path = save_text_as_source(self.urls, filename=self.source_filename)
        self._urls_source_file_path = written_path
        return written_path

    def _validate(self) -> None:
        """Check that the task's settings are coherent.

        Raises:
            AssertionError: if a check fails. Raised explicitly (not via
                ``assert``) so the checks still run under ``python -O``
                while callers keep seeing the same exception type.
        """
        if not (self.urls or self.update_all):
            raise AssertionError(
                'you must either pass some urls to import, or set the task to update'
                ' all existing URLS, otherwise it will do nothing')

        if not (self.schedule in self.SCHEDULE_KEYWORDS or isValidCronSchedule(self.schedule)):
            raise AssertionError(
                f'schedule must be one of {self.SCHEDULE_KEYWORDS} or a valid'
                f' cron expression, got: {self.schedule!r}')

        if self.overwrite and self.resnapshot:
            raise AssertionError(
                'When snapshotting a URL thats previously snapshotted, '
                'you may either overwrite it, or re-snapshot it, but not both')

        if self.depth not in (0, 1):
            raise AssertionError(f'depth must be 0 or 1, got: {self.depth!r}')

    def save(self, *args, **kwargs):
        """Validate the task, write its sources file, persist it, and schedule it.

        Positional and keyword arguments are forwarded to ``Model.save()``.

        Raises:
            AssertionError: if the task's settings fail validation.
        """
        self.urls = self.urls.strip()
        self._validate()

        # some more validation here...
        self._write_urls_source()
        # Persist before scheduling so a scheduled job never refers to a task
        # that was not written to the database.
        super().save(*args, **kwargs)
        self.apply_schedule()

    def apply_schedule(self):
        """Register this task with the configured scheduler.

        Deliberately not named ``schedule``: a method with that name in this
        class body hides the ``schedule`` field inherited from the abstract
        base, because Django skips inherited fields whose name the subclass
        already defines.
        """
        method = 'system crontab' if USE_SYSTEM_CRON else 'archivebox scheduler'
        print(f'[*] Scheduling import {self.name} to run every {self.schedule} using {method}')

        # TODO: decide whether to support system cron at all, or enforce python scheduler
        if USE_SYSTEM_CRON:
            # `schedule` here is the module-level function, not a model attribute.
            schedule(
                every=self.schedule,
                depth=self.depth,
                overwrite=self.overwrite,
                import_path=self.urls_source_file_path,
            )
        else:
            # TODO: update yacron/celery/huey/APScheduler etc. whatever scheduler we choose
            pass

    def run(self, force: bool=False):
        """Run the import once and record its outcome on the task.

        Args:
            force: run even if the task is disabled.

        Returns:
            None. The outcome is stored in ``last_output``, ``last_start_ts``
            and ``last_end_ts``, and ``runs`` is incremented.
        """
        if (not self.enabled) and (not force):
            print(f'[!] Refusing to run scheduled import that is disabled: {self.name}')
            return None

        # TODO: enforce "at most once" or "at least once" concurrency somehow

        print(f'[+] [{timezone.now().isoformat()}] Running scheduled import {self.name}...\n')

        self.last_start_ts = timezone.now()
        self.runs += 1
        try:
            # NOTE(review): self.tag is a many-to-many manager, not a plain
            # value -- confirm what type add() expects for `tag`.
            all_links, new_links = add(
                urls=Path(self.urls_source_file_path),
                tag=self.tag,
                depth=self.depth,
                update_all=self.update_all,
                index_only=self.index_only,
                overwrite=self.overwrite,
                extractors=self.extractors,
                parser=self.parser,
            )
            outcome = f'SUCCEEDED: {len(new_links)} new snapshots ({len(all_links)} total snapshots)'
        except BaseException as err:
            # NOTE(review): this also swallows KeyboardInterrupt/SystemExit;
            # kept as-is so a failing import is recorded rather than
            # propagated -- confirm that is intended.
            outcome = f'FAILED: {err.__class__.__name__} {err}'

        # Error text can be arbitrarily long; keep it within the column size.
        output_limit = self._meta.get_field('last_output').max_length
        self.last_output = outcome[:output_limit]

        self.last_end_ts = timezone.now()
        self.save()
0 commit comments