"""
RPA / Robocorp
Data: extract / Extract images
Download images from a container
tasks.py
"""
import os
import re
import urllib.parse
from pathlib import Path
from typing import List, Tuple, Optional
from robocorp.tasks import task
from robocorp import browser
WEB_URL = "https://unsplash.com/s/photos/landscape"  # Page to scrape images from
IMAGES_CONTAINER = "div.ripi6" # CSS selector for the container holding the images
IMAGES_RENAME = True # True: save as image_1.jpg, image_2.png, ...; False: keep original filenames
IMAGES_FORMATS = [".jpg", ".jpeg", ".png", ".webp"] # Allowed file extensions (compared lowercase)
IMAGES_WIDTH_RANGE = (800, 5000) # Only download images whose width attribute is in this (min, max) range, in px
@task
def download_images_from_container():
    """
    Download images from a specific container on a webpage.

    This robot:
    1. Opens a webpage
    2. Finds image elements within a specified container
    3. Filters images by format and size constraints
    4. Downloads the images to a local folder (output/images)
    """
    # Create output directory
    output_dir = Path("output/images")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Open browser, navigate to the page, and wait for the container
    page = browser.goto(WEB_URL)
    page.wait_for_selector(IMAGES_CONTAINER, state="visible")

    # Find all image elements inside the container
    img_elements = page.locator(IMAGES_CONTAINER).locator("img")
    count = img_elements.count()
    print(f"Found {count} image elements")

    downloaded_count = 0
    for i in range(count):
        img = img_elements.nth(i)

        # Extract the image URL. Lazy-loading sites often put a placeholder
        # "data:" URI in src and the real URL in data-src / data-original.
        src = img.get_attribute("src")
        if not src:
            print(f"Image {i+1}/{count}: No src attribute found")
            continue
        if src.startswith("data:"):
            src = img.get_attribute("data-src") or img.get_attribute("data-original") or src
        # FIX: the original only rejected "data:image/..." URIs, letting other
        # data: URIs fall through to the URL/extension logic. Reject them all.
        if not src or src.startswith("data:"):
            print(f"Image {i+1}/{count}: Invalid or data URL")
            continue

        # FIX: urljoin correctly resolves relative ("/x", "img/x.jpg") AND
        # protocol-relative ("//host/x") URLs against the page URL; the
        # original string concatenation mangled "//host/..." forms and left
        # plain relative paths unresolved. Absolute URLs pass through as-is.
        src = urllib.parse.urljoin(WEB_URL, src)

        # Filter by file extension
        file_ext = get_file_extension(src)
        if file_ext.lower() not in IMAGES_FORMATS:
            print(f"Image {i+1}/{count}: Format {file_ext} not in allowed formats")
            continue

        # Filter by declared width, when the attribute is present and numeric
        width_str = img.get_attribute("width")
        if width_str and width_str.isdigit():
            width = int(width_str)
            if width < IMAGES_WIDTH_RANGE[0] or width > IMAGES_WIDTH_RANGE[1]:
                print(f"Image {i+1}/{count}: Width {width}px outside allowed range")
                continue

        # Build the output filename
        if IMAGES_RENAME:
            filename = f"image_{downloaded_count + 1}{file_ext}"
        else:
            filename = os.path.basename(urllib.parse.urlparse(src).path)
            if not filename:
                # FIX: URL path ends in "/" -> basename is empty; fall back to
                # a generated name instead of writing to the directory itself.
                filename = f"image_{downloaded_count + 1}{file_ext}"
        output_path = output_dir / filename

        # Download the image.
        # FIX: the original did `with browser.page() as download_page:` —
        # robocorp.browser.page() returns a Playwright Page, not a context
        # manager, so that line raised at runtime. Even if it had opened, it
        # saved a *screenshot* (a PNG re-encode of the viewport) mislabeled
        # with the source extension. Fetch the real bytes via the page's
        # request context instead (it shares the browser session's cookies
        # and headers) and write them verbatim.
        try:
            response = page.request.get(src)
            if not response.ok:
                print(f"Image {i+1}/{count}: HTTP {response.status} for {src}")
                continue
            output_path.write_bytes(response.body())
            downloaded_count += 1
            print(f"Image {i+1}/{count}: Downloaded to {output_path}")
        except Exception as e:
            # Best-effort: log and move on to the next image
            print(f"Image {i+1}/{count}: Download failed: {e}")

    print(f"\nDownload complete: {downloaded_count}/{count} images downloaded to {output_dir}")
def get_file_extension(url: str) -> str:
    """Return the file extension of *url*'s path component (e.g. ".png").

    The query string is stripped before parsing; when the path carries no
    extension at all, ".jpg" is returned as a fallback default.
    """
    # Drop everything after the first "?", then take the URL's path part.
    without_query = url.split("?", 1)[0]
    path = urllib.parse.urlparse(without_query).path
    # splitext yields ("stem", ".ext"); an empty extension means none found.
    ext = os.path.splitext(path)[1]
    return ext or ".jpg"