Python Web Scraper

data

Data collection pipeline that fetches records from REST APIs, enriches them with detailed profiles, and stores them in SQLite. Features rate limiting, error handling, and incremental data fetching.

python sqlite requests
Files
fetch_details.py
1#!/usr/bin/env python3
2"""
3Detail Fetcher
4 
5Enriches existing records with additional data from
6individual API endpoints. Supports resumable processing.
7"""
8 
9import sqlite3
10import json
11import time
12import requests
13 
# SQLite database file that holds the `records` table.
DB_PATH = "data/records.db"
# Root URL of the API that per-record detail requests are sent to.
BASE_URL = "https://api.example.com"
# Seconds to sleep between records so requests are rate-limited.
RATE_LIMIT_DELAY = 0.5
17 
18 
def get_records_without_details(db_path=None):
    """Return the records that still need their details fetched.

    A record qualifies when its ``metadata`` column is NULL or does not
    contain the literal text ``"details_fetched": true``.

    Args:
        db_path: SQLite database file to read. When omitted, the
            module-level ``DB_PATH`` is used.

    Returns:
        A list of ``(id, slug)`` tuples ordered by ``created_at``,
        newest first.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        cursor = conn.execute(
            """
            SELECT id, slug FROM records
            WHERE metadata IS NULL
               OR metadata NOT LIKE '%"details_fetched": true%'
            ORDER BY created_at DESC
            """
        )
        return cursor.fetchall()
    finally:
        # Close even when the query fails so the handle is never leaked.
        conn.close()
34 
35 
def fetch_record_details(record_id: str):
    """Fetch the detail payload for a single record.

    Sends a GET request to ``{BASE_URL}/records/{record_id}`` and returns
    the value stored under the top-level ``"data"`` key of the JSON body.

    Args:
        record_id: Identifier interpolated into the request URL.

    Returns:
        The ``"data"`` value from the response, or ``None`` when the request
        fails, the body is not valid JSON, the body is not a JSON object,
        or the object has no ``"data"`` key.
    """
    try:
        response = requests.get(
            f"{BASE_URL}/records/{record_id}",
            timeout=30,
            headers={"Accept": "application/json"},
        )
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"Error fetching details for {record_id}: {e}")
        return None

    if not isinstance(payload, dict):
        # A JSON array or scalar has no .get(); report it as a failed fetch
        # instead of crashing with AttributeError.
        print(f"Error fetching details for {record_id}: unexpected response format")
        return None

    return payload.get("data")
50 
51 
def update_record_with_details(record_id: str, details: dict, db_path=None):
    """Merge fetched details into a record's metadata and persist the result.

    The record's stored metadata JSON is loaded, ``details`` is merged on
    top of it, and ``details_fetched`` is set to ``True``. The row's
    ``metadata`` and ``updated_at`` columns are then rewritten. The
    ``description`` column is updated only when a non-empty description is
    available from ``details`` or the merged metadata; otherwise the
    existing column value is kept.

    Args:
        record_id: Value matched against the ``id`` column.
        details: Mapping of detail fields to merge into the metadata.
        db_path: SQLite database file to update. When omitted, the
            module-level ``DB_PATH`` is used.

    Raises:
        sqlite3.Error: If the database cannot be opened or updated.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        row = conn.execute(
            "SELECT metadata FROM records WHERE id = ?", (record_id,)
        ).fetchone()

        metadata = {}
        if row and row[0]:
            try:
                stored = json.loads(row[0])
            except ValueError:
                # Corrupt metadata would otherwise abort the whole run and
                # block this record on every retry; start from empty.
                print(f"Warning: ignoring unparseable metadata for {record_id}")
                stored = None
            # Only a JSON object can be merged into; ignore arrays/scalars.
            if isinstance(stored, dict):
                metadata = stored

        # Merge details into metadata and mark the record as processed.
        metadata.update(details)
        metadata["details_fetched"] = True

        # Normalize "no description" to None so COALESCE below keeps the
        # existing column value instead of overwriting it with NULL.
        description = (
            details.get("description") or metadata.get("description") or None
        )

        conn.execute(
            """
            UPDATE records
            SET description = COALESCE(?, description),
                metadata = ?,
                updated_at = datetime('now')
            WHERE id = ?
            """,
            (description, json.dumps(metadata), record_id),
        )
        conn.commit()
    finally:
        # Close even on failure so the handle is never leaked.
        conn.close()
83 
84 
def fetch_all_details():
    """Fetch and store details for every record that still needs them.

    Loads the pending records, requests details for each one in turn, and
    saves any non-empty result. A record whose fetch returns nothing (or an
    empty payload) is reported as failed and left unmarked, so it is picked
    up again on the next run. Progress is printed to stdout, and the loop
    pauses ``RATE_LIMIT_DELAY`` seconds between records.
    """
    records = get_records_without_details()
    total = len(records)

    print(f"Found {total} records needing details")

    for i, (record_id, slug) in enumerate(records, 1):
        print(f"[{i}/{total}] Fetching: {slug or record_id}")

        details = fetch_record_details(record_id)

        if details:
            update_record_with_details(record_id, details)
            print(" ✓ Updated")
        else:
            print(" ✗ Failed")

        # Rate-limit between requests only; nothing follows the last record,
        # so sleeping after it would just delay completion.
        if i < total:
            time.sleep(RATE_LIMIT_DELAY)

    print("\nComplete!")
106 
107 
# Script entry point: run the full detail-fetch pass when executed directly,
# but not when this module is imported.
if __name__ == "__main__":
    fetch_all_details()
Chat with me 👋🏻
William

Ask William

Available to chat