Python Web Scraper
Data collection pipeline that fetches records from REST APIs, enriches them with detailed profiles, and stores them in SQLite. Features rate limiting, error handling, and incremental data fetching.
python
sqlite
requests
Files
fetch_details.py
#!/usr/bin/env python3
"""
Detail Fetcher

Enriches existing records with additional data from
individual API endpoints. Supports resumable processing.
"""

import json
import sqlite3
import time
from contextlib import closing

import requests

DB_PATH = "data/records.db"
BASE_URL = "https://api.example.com"
RATE_LIMIT_DELAY = 0.5  # seconds to sleep after each record is processed


def get_records_without_details():
    """Return the ``(id, slug)`` rows of records that still need details.

    A record qualifies when its ``metadata`` column is NULL or does not
    contain the ``"details_fetched": true`` marker that
    ``update_record_with_details`` writes. Rows are ordered by
    ``created_at`` descending.
    """
    # sqlite3's own context manager only commits/rolls back; closing()
    # is what guarantees the connection is released if the query raises.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        return conn.execute("""
            SELECT id, slug FROM records
            WHERE metadata NOT LIKE '%"details_fetched": true%'
               OR metadata IS NULL
            ORDER BY created_at DESC
        """).fetchall()


def fetch_record_details(record_id: str):
    """Fetch detailed data for a single record.

    Returns the ``data`` member of the JSON response (a dict), or ``None``
    when the request fails, the body is not valid JSON, the response is
    not a JSON object, or ``data`` is missing or not an object.
    Failures are reported on stdout rather than raised.
    """
    try:
        response = requests.get(
            f"{BASE_URL}/records/{record_id}",
            timeout=30,
            headers={"Accept": "application/json"},
        )
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers an undecodable body on requests versions where
        # the JSON decode error is not a RequestException subclass.
        print(f"Error fetching details for {record_id}: {e}")
        return None

    if not isinstance(payload, dict):
        print(f"Error fetching details for {record_id}: response is not a JSON object")
        return None

    details = payload.get("data")
    if details is not None and not isinstance(details, dict):
        # The caller merges the result with dict.update(); anything else
        # would fail there, so treat it as a failed fetch.
        print(f"Error fetching details for {record_id}: 'data' is not a JSON object")
        return None
    return details


def update_record_with_details(record_id: str, details: dict):
    """Merge fetched details into a record's metadata and mark it fetched.

    The stored metadata JSON (if any) is updated with ``details`` and gets
    ``details_fetched = True``. The ``description`` column is set from the
    fetched description, falling back to the one already in the metadata;
    when neither provides a value the existing column is left untouched.

    Raises:
        ValueError: the stored metadata is not valid JSON
            (``json.JSONDecodeError``) or is not a JSON object.
        sqlite3.Error: the database operation failed.
    """
    with closing(sqlite3.connect(DB_PATH)) as conn:
        row = conn.execute(
            "SELECT metadata FROM records WHERE id = ?", (record_id,)
        ).fetchone()

        metadata = json.loads(row[0]) if row and row[0] else {}
        if not isinstance(metadata, dict):
            raise ValueError(
                f"metadata for record {record_id} is not a JSON object"
            )

        # Resolve the description BEFORE merging: after the merge the
        # metadata already holds the fetched value, so the fallback to the
        # previously stored description could never take effect.
        description = details.get("description") or metadata.get("description")

        metadata.update(details)
        metadata["details_fetched"] = True

        # COALESCE keeps the current column value when there is no
        # description to write, instead of overwriting it with NULL.
        # json.dumps must keep its default separators: the pending-records
        # query matches the literal text '"details_fetched": true'.
        conn.execute("""
            UPDATE records
            SET description = COALESCE(?, description),
                metadata = ?,
                updated_at = datetime('now')
            WHERE id = ?
        """, (description or None, json.dumps(metadata), record_id))
        conn.commit()


def fetch_all_details():
    """Process all records that need details.

    Fetches and stores details for every pending record, sleeping
    ``RATE_LIMIT_DELAY`` seconds after each one. A record whose fetch
    fails, or whose stored metadata is unusable, is reported and skipped;
    it stays pending and is picked up again on the next run.
    """
    records = get_records_without_details()
    total = len(records)

    print(f"Found {total} records needing details")

    for i, (record_id, slug) in enumerate(records, 1):
        print(f"[{i}/{total}] Fetching: {slug or record_id}")

        details = fetch_record_details(record_id)

        if details:
            try:
                update_record_with_details(record_id, details)
            except ValueError as e:
                # One record with corrupt stored metadata must not abort
                # the whole batch; database errors still propagate.
                print(f" ✗ Failed: {e}")
            else:
                print(" ✓ Updated")
        else:
            print(" ✗ Failed")

        time.sleep(RATE_LIMIT_DELAY)

    print("\nComplete!")


if __name__ == "__main__":
    fetch_all_details()