46 Beautiful Soup (WebPage)

import requests
from bs4 import BeautifulSoup
import chardet


def fetch_urlsoup(url, timeout=10):
    """Download HTML content from a URL and return a BeautifulSoup object."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }
    try:
        # Try to download the webpage
        response = requests.get(url, timeout=timeout, headers=headers)
        response.raise_for_status()

        # Auto-detect encoding (fall back to requests' own guess if chardet is unsure)
        detected = chardet.detect(response.content)
        response.encoding = detected["encoding"] or response.apparent_encoding

        # Return BeautifulSoup object if successful
        return BeautifulSoup(response.text, "html.parser")

    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
    except requests.exceptions.ConnectionError as e:
        print(f"Connection Error: {e}")
    except requests.exceptions.Timeout as e:
        print(f"Timeout Error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request Exception: {e}")

    # Return None if any exception occurred
    return None
soup_ex = fetch_urlsoup("https://example.com")
soup_ex
<!DOCTYPE html>

<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
print(soup_ex.title.text)  # Print the page title
Example Domain
# Extract all URLs
for link in soup_ex.find_all('a'):
    print(link.get('href'))
https://www.iana.org/domains/example
soup_ex.get_text()
'\n\n\nExample Domain\n\n\n\n\n\n\n\nExample Domain\nThis domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.\nMore information...\n\n\n\n'
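Note that fetch_urlsoup returns None whenever the request fails, so code that immediately calls methods such as .title or .find_all() will raise AttributeError on a bad URL. A minimal guard, using a hypothetical URL purely for illustration:

soup_maybe = fetch_urlsoup("https://example.com/might-not-exist")  # hypothetical URL
if soup_maybe is None:
    print("Fetch failed; nothing to parse")
else:
    print(soup_maybe.title.text)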

46.1 rPoster Scrape

import pandas as pd
soup_rposter = fetch_urlsoup("https://radiopaedia.org/courses/rposter")
link_df = pd.DataFrame({
    "href": [link.get('href') for link in soup_rposter.find_all('a')],
    "string": [link.string for link in soup_rposter.find_all('a')]
    })
link_df.head()
                      href        string
0                        #          None
1           /edits?lang=us  Recent Edits
2    /sessions/new?lang=us        Log In
3        /articles?lang=us      Articles
4   /users/sign_up?lang=us       Sign Up
link_df_mod = (
    link_df
    .pipe(lambda df: df[df["href"].str.contains("/courses/rposter/pages", regex=True, na=False)])
    .pipe(lambda df: df.assign(href_full="https://radiopaedia.org" + df["href"]))
)
link_df_mod.head()
                           href                                         string                                          href_full
77  /courses/rposter/pages/2409              10 Must-Know In Abdominal Imaging  https://radiopaedia.org/courses/rposter/pages/...
78  /courses/rposter/pages/2480       A Beginner's Guide To: Perianal Fistulas  https://radiopaedia.org/courses/rposter/pages/...
79  /courses/rposter/pages/2489             An Overview Of Polyposis Syndromes  https://radiopaedia.org/courses/rposter/pages/...
80  /courses/rposter/pages/2416  Appendicitis: A Declassified Guide To Imaging  https://radiopaedia.org/courses/rposter/pages/...
81  /courses/rposter/pages/2438                               Bowel Ultrasound  https://radiopaedia.org/courses/rposter/pages/...

46.1.0.1 Get Download URL

import time

soup_pdf_url = {}

# Scrape only the first two poster pages as a demonstration
for index, row in link_df_mod[0:2].iterrows():
    url = row["href_full"]
    title = row["string"]
    try:
        soup_dl_page_url = fetch_urlsoup(url)
        # Grab the first link whose text is exactly "DOWNLOAD PDF"
        soup_pdf_url[title] = [link.get('href')
                               for link in soup_dl_page_url.find_all('a')
                               if link.string == "DOWNLOAD PDF"][0]
        time.sleep(1)  # Pause between requests to stay polite
    except Exception as e:
        print(f"Error: {e}")
soup_pdf_url
{'10 Must-Know In Abdominal Imaging': 'https://prod-images-static.radiopaedia.org/page_images/5054/R24-004_10_must-know_in_abdominal_imaging.pdf',
 "A Beginner's Guide To: Perianal Fistulas": 'https://prod-images-static.radiopaedia.org/page_images/5100/R24-259_A_Beginner_s_Guide_to_Perianal_Fistulas.pdf'}

46.1.1 Download PDF

from pathlib import Path

def download_pdf(url, output_path=None):
    """Download a PDF file from a URL and save it to the specified path."""
    # Send GET request, streaming the body so large files are not held in memory
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()  # Raise an exception for bad responses

    # Default to the file name taken from the URL
    if output_path is None:
        output_path = Path(url).name

    # Write content to file in chunks
    with open(output_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

    return output_path
download_pdf(soup_pdf_url["10 Must-Know In Abdominal Imaging"])
'R24-004_10_must-know_in_abdominal_imaging.pdf'
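The call above downloads a single poster. A possible extension, sketched here under the assumption that every entry in soup_pdf_url points at a downloadable PDF, loops over the dictionary and saves each file into a local folder (the folder name rposter_pdfs is made up for the example):

import time
from pathlib import Path

out_dir = Path("rposter_pdfs")  # hypothetical output folder
out_dir.mkdir(exist_ok=True)

for title, pdf_url in soup_pdf_url.items():
    target = out_dir / Path(pdf_url).name
    print(f"Downloading: {title}")
    download_pdf(pdf_url, target)  # reuse the helper defined above
    time.sleep(1)  # pause between downloads to stay polite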