"""Scrape one Reddit post and its comments through the Decodo scraping API.

The script posts a scraping task to the Decodo endpoint, saves the returned
HTML to ``test.html``, extracts post-level and comment-level data points by
CSS class, and writes the result to ``data.json``.
"""

import json
from typing import Optional

import requests
from bs4 import BeautifulSoup

url = "https://scrape.decodo.com/v1/tasks"

payload = {
    "target": "universal",
    "url": "https://www.reddit.com/r/aww/comments/11xq5ew/yesterday_i_posted_how_i_was_going_to_adopt/",
    "headless": "html",
    "parse": "false"
}

headers = {
    "accept": "application/json",
    "content-type": "application/json",
    # NOTE(review): "AUTH" looks like a placeholder for real credentials --
    # confirm and substitute before running.
    "authorization": "Basic AUTH"
}

# Seconds to wait for the scraping API before giving up. Without a timeout
# ``requests`` waits forever on a stalled connection.
# NOTE(review): the value is a guess; tune it to the API's real latency.
REQUEST_TIMEOUT = 180

# (output key, tag name, CSS class) for every post-level data point.
_POST_FIELDS = (
    ("Username", 'a', '_2tbHP6ZydRpjI44J3syuqC _23wugcdiaj44hdfugIAlnX oQctV4n0yUb0uiHDdGnmE'),
    ("PostedAt", 'span', '_2VF2J19pUIMSLJFky-7PEI'),
    ("PostTitle", 'h1', '_eYtD2XCVieq6emjKBH3m'),
    ("CommentCount", 'span', 'FHCV02u6Cp2zYL0fhQPsO'),
    ("UpvotePercentage", 'div', 't4Hq30BDzTeJ85vREX7_M'),
    ("SubredditDescription", 'div', '_1zPvgKHteTOub9dKkvrOl4'),
    ("SubredditName", 'span', '_19bCWnxeTjqzBElWZfIlJb'),
    ("SubredditCreated", 'span', '_1d4NeAxWOiy0JPz7aXRI64'),
    ("SubredditMembers", 'div', '_3b9utyKN3e_kzVZ5ngPqAu'),
    ("SubredditMembersOnline", 'div', '_21RLQh5PvUhC6vOKoFeHUP'),
)

# CSS class of the <div> that wraps each comment.
_COMMENT_CONTAINER_CLASS = '_3tw__eCCe7j-epNCKGXUKk'


def _fetch_html() -> str:
    """Request the target page from the scraping API and return its HTML.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        requests.Timeout: if no answer arrives within ``REQUEST_TIMEOUT``.
        KeyError, IndexError: if the JSON body has no ``results[0].content``.
    """
    response = requests.post(
        url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT
    )
    # Fail here with a clear HTTP error instead of a confusing KeyError below.
    response.raise_for_status()
    content = response.json()['results'][0]['content']
    # Strip scraped content from backslashes.
    # NOTE(review): this removes every backslash, including any that are a
    # legitimate part of the page text -- confirm this is still wanted.
    return content.replace('\\', '')


def _first_text(soup: BeautifulSoup, tag: str, css_class: str) -> Optional[str]:
    """Return the text of the first ``tag`` with ``css_class``, or ``None``."""
    element = soup.find(tag, class_=css_class)
    return element.text if element is not None else None


def _extract_post(soup: BeautifulSoup) -> dict:
    """Collect the post-level data points listed in ``_POST_FIELDS``.

    A field whose element is not found in the page is stored as ``None``
    rather than aborting the run, so one changed class name does not
    discard everything else that was scraped.
    """
    return {
        key: _first_text(soup, tag, css_class)
        for key, tag, css_class in _POST_FIELDS
    }


def _extract_comment(div_tag) -> dict:
    """Collect the data points found inside one comment container.

    Every value is a list holding all matches found within ``div_tag``.
    """
    author_tags = div_tag.find_all(
        'a', class_='wM6scouPXXsFDSZmZPHRo DjcdNGtVXPcxG0yiFXIoZ _23wugcdiaj44hdfugIAlnX'
    )
    timestamp_tags = div_tag.find_all('a', class_='_3yx4Dn0W3Yunucf5sVJeFU')
    # Same anchor as the timestamp, limited to those that carry an href.
    url_tags = div_tag.find_all(
        'a', class_='_3yx4Dn0W3Yunucf5sVJeFU', href=True
    )
    text_tags = div_tag.find_all('p', class_='_1qeIAgB0cPwnLhDF9XSiJM')
    upvote_tags = div_tag.find_all(
        'div', class_='_1rZYMD_4xY3gRcSS3p8ODO _25IkBM0rRUqWX5ZojEMAFQ _3ChHiOyYyUkpZ_Nm3ZyM2M'
    )
    return {
        'CommentAuthorName': [tag.text for tag in author_tags],
        'CommentDate': [tag.text for tag in timestamp_tags],
        'CommentURL': [tag['href'] for tag in url_tags],
        'CommentText': [tag.text for tag in text_tags],
        'CommentUpvotes': [tag.text for tag in upvote_tags],
    }


def main():
    """Fetch the page, save the raw HTML, and dump the parsed data as JSON.

    Writes ``test.html`` (the fetched HTML after backslash stripping) and
    ``data.json`` (a list whose first item is the post and whose remaining
    items are the comments) to the current working directory.
    """
    html = _fetch_html()

    # Explicit encoding: scraped pages can contain non-ASCII text, which a
    # platform-default codec may fail to encode.
    with open("test.html", "w", encoding="utf-8") as html_file:
        html_file.write(html)

    soup = BeautifulSoup(html, "html.parser")

    data = [_extract_post(soup)]
    data.extend(
        _extract_comment(div_tag)
        for div_tag in soup.find_all('div', class_=_COMMENT_CONTAINER_CLASS)
    )

    # Save data to JSON
    with open('data.json', 'w', encoding="utf-8") as f:
        json.dump(data, f)


if __name__ == "__main__":
    main()