import requests
from bs4 import BeautifulSoup
46 Beautiful Soup (WebPage)
import requests
from bs4 import BeautifulSoup
import chardet
def fetch_urlsoup(url, timeout=10):
    """Download HTML content from a URL and return a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : int or float, default 10
        Seconds to wait for the server before giving up.

    Returns
    -------
    BeautifulSoup or None
        Parsed page on success; ``None`` if any request error occurred
        (the error is printed, not raised).
    """
    # Browser-like headers: some sites refuse the default requests user agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }
    try:
        # Try to download the webpage
        response = requests.get(url, timeout=timeout, headers=headers)
        response.raise_for_status()
        # Auto-detect encoding so response.text decodes correctly
        detected = chardet.detect(response.content)
        response.encoding = detected["encoding"]
        # Return BeautifulSoup object if successful
        return BeautifulSoup(response.text, "html.parser")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
    except requests.exceptions.ConnectionError as e:
        print(f"Connection Error: {e}")
    except requests.exceptions.Timeout as e:
        print(f"Timeout Error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request Exception: {e}")
    # Return None if any exception occurred
    return None
# Fetch and parse the example page; display the parsed soup.
soup_ex = fetch_urlsoup("https://example.com")
soup_ex
<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 2em;
background-color: #fdfdff;
border-radius: 0.5em;
box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
div {
margin: 0 auto;
width: auto;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
print(soup_ex.title.text)  # Print the page title (text inside the <title> tag)
Example Domain
# Print the href attribute of every anchor tag on the page
for anchor in soup_ex.find_all('a'):
    print(anchor.get('href'))
https://www.iana.org/domains/example
# Extract all human-readable text from the page, with tags stripped
soup_ex.get_text()
'\n\n\nExample Domain\n\n\n\n\n\n\n\nExample Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\nMore information...\n\n\n\n'
46.1 rPoster Scrape
import pandas as pd

# Fetch the rPoster course page and tabulate every link's target and label.
soup_rposter = fetch_urlsoup("https://radiopaedia.org/courses/rposter")

link_df = pd.DataFrame({
    "href": [link.get('href') for link in soup_rposter.find_all('a')],
    "string": [link.string for link in soup_rposter.find_all('a')],
})
link_df.head()
| | href | string |
|---|---|---|
0 | # | None |
1 | /edits?lang=us | Recent Edits |
2 | /sessions/new?lang=us | Log In |
3 | /articles?lang=us | Articles |
4 | /users/sign_up?lang=us | Sign Up |
# Keep only links to poster pages and build their absolute URLs.
link_df_mod = (
    link_df
    .pipe(lambda df: df[df["href"].str.contains("/courses/rposter/pages", regex=True, na=False)])
    .pipe(lambda df: df.assign(href_full="https://radiopaedia.org" + df["href"]))
)
link_df_mod.head()
| | href | string | href_full |
|---|---|---|---|
77 | /courses/rposter/pages/2409 | 10 Must-Know In Abdominal Imaging | https://radiopaedia.org/courses/rposter/pages/... |
78 | /courses/rposter/pages/2480 | A Beginner's Guide To: Perianal Fistulas | https://radiopaedia.org/courses/rposter/pages/... |
79 | /courses/rposter/pages/2489 | An Overview Of Polyposis Syndromes | https://radiopaedia.org/courses/rposter/pages/... |
80 | /courses/rposter/pages/2416 | Appendicitis: A Declassified Guide To Imaging | https://radiopaedia.org/courses/rposter/pages/... |
81 | /courses/rposter/pages/2438 | Bowel Ultrasound | https://radiopaedia.org/courses/rposter/pages/... |
46.1.0.1 Get Download URL
import time

# Map each poster title to the direct PDF download URL found on its page.
soup_pdf_url = {}

for index, row in link_df_mod[0:2].iterrows():
    url = row["href_full"]
    title = row["string"]
    try:
        soup_dl_page_url = fetch_urlsoup(url)
        # First anchor labelled "DOWNLOAD PDF" holds the file URL
        soup_pdf_url[title] = [link.get('href')
                               for link in soup_dl_page_url.find_all('a')
                               if link.string == "DOWNLOAD PDF"][0]
        # Be polite to the server between requests
        time.sleep(1)
    except Exception as e:
        print(f"Error: {e}")

soup_pdf_url
{'10 Must-Know In Abdominal Imaging': 'https://prod-images-static.radiopaedia.org/page_images/5054/R24-004_10_must-know_in_abdominal_imaging.pdf',
"A Beginner's Guide To: Perianal Fistulas": 'https://prod-images-static.radiopaedia.org/page_images/5100/R24-259_A_Beginner_s_Guide_to_Perianal_Fistulas.pdf'}
46.1.1 Download PDF
from pathlib import Path
def download_pdf(url, output_path=None, timeout=30):
    """Download a PDF file from a URL and save it to the specified path.

    Parameters
    ----------
    url : str
        Direct URL of the PDF file.
    output_path : str or Path, optional
        Destination file path. Defaults to the last component of the URL.
    timeout : int or float, default 30
        Seconds to wait for the server (avoids hanging forever).

    Returns
    -------
    str or Path
        The path the file was written to.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    # Send GET request; stream=True avoids loading the whole file into memory
    response = requests.get(url, stream=True, timeout=timeout)
    # Raise an exception for bad responses
    response.raise_for_status()

    if output_path is None:
        # Use the final URL path component as the local filename
        output_path = Path(url).name

    # Write content to file in chunks
    with open(output_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

    return output_path
"10 Must-Know In Abdominal Imaging"]) download_pdf(soup_pdf_url[
'R24-004_10_must-know_in_abdominal_imaging.pdf'