import requests
from bs4 import BeautifulSoup
46 Beautiful Soup (WebPage)
import requests
from bs4 import BeautifulSoup
import chardet
def fetch_urlsoup(url, timeout=10):
    """Download HTML content from a URL and return a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : int or float, default 10
        Seconds to wait for the server before giving up.

    Returns
    -------
    BeautifulSoup or None
        Parsed page on success; ``None`` if any request error occurred
        (the error is printed, not raised).
    """
    # Browser-like headers: some sites refuse the default requests user agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }
    try:
        # Try to download the webpage
        response = requests.get(url, timeout=timeout, headers=headers)
        response.raise_for_status()
        # Auto-detect encoding so response.text decodes correctly
        detected = chardet.detect(response.content)
        response.encoding = detected["encoding"]
        # Return BeautifulSoup object if successful
        return BeautifulSoup(response.text, "html.parser")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
    except requests.exceptions.ConnectionError as e:
        print(f"Connection Error: {e}")
    except requests.exceptions.Timeout as e:
        print(f"Timeout Error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request Exception: {e}")
    # Return None if any exception occurred
    return None
# Fetch and parse the example page; display the parsed soup.
soup_ex = fetch_urlsoup("https://example.com")
soup_ex
<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 2em;
background-color: #fdfdff;
border-radius: 0.5em;
box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
div {
margin: 0 auto;
width: auto;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
print(soup_ex.title.text)  # Print the page title (text inside the <title> tag)
Example Domain
# Print the href attribute of every anchor tag on the page
for anchor in soup_ex.find_all('a'):
    print(anchor.get('href'))
https://www.iana.org/domains/example
# Extract all human-readable text from the page, with tags stripped
soup_ex.get_text()
'\n\n\nExample Domain\n\n\n\n\n\n\n\nExample Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\nMore information...\n\n\n\n'
46.1 rPoster Scrape
import pandas as pd

# Fetch the rPoster course page and tabulate every link's target and label.
soup_rposter = fetch_urlsoup("https://radiopaedia.org/courses/rposter")

link_df = pd.DataFrame({
    "href": [link.get('href') for link in soup_rposter.find_all('a')],
    "string": [link.string for link in soup_rposter.find_all('a')],
})
link_df.head()
| | href | string |
|---|---|---|
0 | # | None |
1 | /edits?lang=us | Recent Edits |
2 | /sessions/new?lang=us | Log In |
3 | /articles?lang=us | Articles |
4 | /users/sign_up?lang=us | Sign Up |
# Keep only links to poster pages and build their absolute URLs.
link_df_mod = (
    link_df
    .pipe(lambda df: df[df["href"].str.contains("/courses/rposter/pages", regex=True, na=False)])
    .pipe(lambda df: df.assign(href_full="https://radiopaedia.org" + df["href"]))
)
link_df_mod.head()
| | href | string | href_full |
|---|---|---|---|
77 | /courses/rposter/pages/2409 | 10 Must-Know In Abdominal Imaging | https://radiopaedia.org/courses/rposter/pages/... |
78 | /courses/rposter/pages/2480 | A Beginner's Guide To: Perianal Fistulas | https://radiopaedia.org/courses/rposter/pages/... |
79 | /courses/rposter/pages/2489 | An Overview Of Polyposis Syndromes | https://radiopaedia.org/courses/rposter/pages/... |
80 | /courses/rposter/pages/2416 | Appendicitis: A Declassified Guide To Imaging | https://radiopaedia.org/courses/rposter/pages/... |
81 | /courses/rposter/pages/2438 | Bowel Ultrasound | https://radiopaedia.org/courses/rposter/pages/... |
46.1.0.1 Get Download URL
import time

# Map each poster title to the direct PDF download URL found on its page.
soup_pdf_url = {}

for index, row in link_df_mod[0:2].iterrows():
    url = row["href_full"]
    title = row["string"]
    try:
        soup_dl_page_url = fetch_urlsoup(url)
        # First anchor labelled "DOWNLOAD PDF" holds the file URL
        soup_pdf_url[title] = [link.get('href')
                               for link in soup_dl_page_url.find_all('a')
                               if link.string == "DOWNLOAD PDF"][0]
        # Be polite to the server between requests
        time.sleep(1)
    except Exception as e:
        print(f"Error: {e}")

soup_pdf_url
{'10 Must-Know In Abdominal Imaging': 'https://prod-images-static.radiopaedia.org/page_images/5054/R24-004_10_must-know_in_abdominal_imaging.pdf',
"A Beginner's Guide To: Perianal Fistulas": 'https://prod-images-static.radiopaedia.org/page_images/5100/R24-259_A_Beginner_s_Guide_to_Perianal_Fistulas.pdf'}
46.1.1 Download PDF
from pathlib import Path
def download_pdf(url, output_path=None, timeout=30):
    """Download a PDF file from a URL and save it to the specified path.

    Parameters
    ----------
    url : str
        Direct URL of the PDF file.
    output_path : str or Path, optional
        Destination file path. Defaults to the last component of the URL.
    timeout : int or float, default 30
        Seconds to wait for the server (avoids hanging forever).

    Returns
    -------
    str or Path
        The path the file was written to.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server responds with a 4xx/5xx status.
    """
    # Send GET request; stream=True avoids loading the whole file into memory
    response = requests.get(url, stream=True, timeout=timeout)
    # Raise an exception for bad responses
    response.raise_for_status()

    if output_path is None:
        # Use the final URL path component as the local filename
        output_path = Path(url).name

    # Write content to file in chunks
    with open(output_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

    return output_path
"10 Must-Know In Abdominal Imaging"]) download_pdf(soup_pdf_url[
'R24-004_10_must-know_in_abdominal_imaging.pdf'