import requests
from bs4 import BeautifulSoup

53 Beautiful Soup (WebPage)
import requests
from bs4 import BeautifulSoup
import chardet
def fetch_urlsoup(url: str, timeout: float = 10) -> "BeautifulSoup | None":
    """Download HTML content from a URL and return a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, default 10
        Seconds to wait for the server before aborting the request.

    Returns
    -------
    BeautifulSoup parse tree on success; on any caught request error the
    function falls through to the shared failure return (None) below.
    """
    # Browser-like headers: some sites refuse the default python-requests
    # User-Agent, so we present ourselves as a regular desktop browser.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }
    try:
        # Try to download the webpage
        response = requests.get(url, timeout=timeout, headers=headers)
        response.raise_for_status()
        # Auto-detect encoding from the raw bytes so response.text decodes
        # correctly even when the server's charset header is wrong or absent.
        # NOTE(review): chardet can report {"encoding": None} (e.g. empty
        # body); requests then falls back to its own detection — confirm
        # that is acceptable here.
        detected = chardet.detect(response.content)
        response.encoding = detected["encoding"]
        # Return BeautifulSoup object if successful
        return BeautifulSoup(response.text, "html.parser")
    # Most specific requests errors first; RequestException is the base
    # class and acts as the catch-all for anything not matched above.
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
    except requests.exceptions.ConnectionError as e:
        print(f"Connection Error: {e}")
    except requests.exceptions.Timeout as e:
        print(f"Timeout Error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request Exception: {e}")
    # Return None if any exception occurred
    return None

soup_ex = fetch_urlsoup("https://example.com")
soup_ex
<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 2em;
background-color: #fdfdff;
border-radius: 0.5em;
box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
div {
margin: 0 auto;
width: auto;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
print(soup_ex.title.text)  # Print the page title
Example Domain
# Extract all URLs
for link in soup_ex.find_all('a'):
    print(link.get('href'))
https://www.iana.org/domains/example
soup_ex.get_text()
'\n\n\nExample Domain\n\n\n\n\n\n\n\nExample Domain\nThis domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.\nMore information...\n\n\n\n'
53.1 rPoster Scrape
import pandas as pd

soup_rposter = fetch_urlsoup("https://radiopaedia.org/courses/rposter")

link_df = pd.DataFrame({
"href": [link.get('href') for link in soup_rposter.find_all('a')],
"string": [link.string for link in soup_rposter.find_all('a')]
})
link_df.head()
|  | href | string |
|---|---|---|
| 0 | # | None |
| 1 | /edits?lang=us | Recent Edits |
| 2 | /sessions/new?lang=us | Log In |
| 3 | /articles?lang=us | Articles |
| 4 | /users/sign_up?lang=us | Sign Up |
# Keep only anchors that point at rPOSTER pages, then derive an absolute
# URL for each (the scraped hrefs are site-relative).
_is_poster_page = link_df["href"].str.contains(
    "/courses/rposter/pages", regex=True, na=False
)
link_df_mod = link_df.loc[_is_poster_page].assign(
    href_full=lambda df: "https://radiopaedia.org" + df["href"]
)
link_df_mod.head()
|  | href | string | href_full |
|---|---|---|---|
| 77 | /courses/rposter/pages/2409 | 10 Must-Know In Abdominal Imaging | https://radiopaedia.org/courses/rposter/pages/... |
| 78 | /courses/rposter/pages/2480 | A Beginner's Guide To: Perianal Fistulas | https://radiopaedia.org/courses/rposter/pages/... |
| 79 | /courses/rposter/pages/2489 | An Overview Of Polyposis Syndromes | https://radiopaedia.org/courses/rposter/pages/... |
| 80 | /courses/rposter/pages/2416 | Appendicitis: A Declassified Guide To Imaging | https://radiopaedia.org/courses/rposter/pages/... |
| 81 | /courses/rposter/pages/2438 | Bowel Ultrasound | https://radiopaedia.org/courses/rposter/pages/... |
53.1.0.1 Get Download URL
import time

# Map poster title -> direct PDF download URL, scraped from each poster page.
soup_pdf_url = {}
# Demo-sized crawl: only the first two poster pages are visited here.
for index, row in link_df_mod[0:2].iterrows():
    url = row["href_full"]
    title = row["string"]
    try:
        soup_dl_page_url = fetch_urlsoup(url)
        # Take the href of the first anchor labelled exactly "DOWNLOAD PDF";
        # [0] raises IndexError (caught below) when no such link exists.
        soup_pdf_url[title] = [link.get('href')
                               for link in soup_dl_page_url.find_all('a')
                               if link.string == "DOWNLOAD PDF"][0]
        # Be polite to the server: pause between page fetches.
        time.sleep(1)
    except Exception as e:
        print(f"Error: {e}")

soup_pdf_url
{'10 Must-Know In Abdominal Imaging': 'https://prod-images-static.radiopaedia.org/page_images/5054/R24-004_10_must-know_in_abdominal_imaging.pdf',
"A Beginner's Guide To: Perianal Fistulas": 'https://prod-images-static.radiopaedia.org/page_images/5100/R24-259_A_Beginner_s_Guide_to_Perianal_Fistulas.pdf'}
53.1.1 Download PDF
from pathlib import Path

def download_pdf(url: str, output_path=None):
    """Download a PDF file from a URL and save it to the specified path.

    Parameters
    ----------
    url : str
        Direct link to the PDF resource.
    output_path : str | Path | None
        Destination file; when None, the URL's final path component is used
        as the filename in the current working directory.
    """
    # Send GET request; stream=True avoids loading the whole file into memory.
    # NOTE(review): no timeout is passed, so this can hang on a stalled
    # connection — consider requests.get(..., timeout=...).
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an exception for bad responses
    if output_path is None:
        # Path(url).name yields the last path segment of the URL.
        output_path = Path(url).name
    # Write content to file in fixed-size chunks.
    with open(output_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
    return output_path

download_pdf(soup_pdf_url["10 Must-Know In Abdominal Imaging"])
'R24-004_10_must-know_in_abdominal_imaging.pdf'