RPA / Robocorp

Data extraction: extract images

Download images from a container element on a web page.

tasks.py

import os
import re
import urllib.parse
from pathlib import Path
from typing import List, Tuple, Optional

from robocorp.tasks import task
from robocorp import browser

# Page to scrape. NOTE(review): "div.ripi6" below is an Unsplash-generated
# class name and is likely to break when the site updates — verify before use.
WEB_URL = "https://unsplash.com/s/photos/landscape"
IMAGES_CONTAINER = "div.ripi6"  # CSS selector for the container with images
IMAGES_RENAME = True  # Set to False to keep original filenames
IMAGES_FORMATS = [".jpg", ".jpeg", ".png", ".webp"]  # File extensions to download
IMAGES_WIDTH_RANGE = (800, 5000)  # Only download images in this width range (min, max)

@task
def download_images_from_container():
    """
    Download images from a specific container on a webpage.

    This robot:
    1. Opens a webpage
    2. Finds image elements within a specified container
    3. Filters images by format and size constraints
    4. Downloads the images to a local folder (output/images)
    """
    # Create output directory
    output_dir = Path("output/images")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Open browser and navigate to the page
    page = browser.goto(WEB_URL)

    # Wait for the images container to be visible
    page.wait_for_selector(IMAGES_CONTAINER, state="visible")

    # Find all image elements inside the container
    container = page.locator(IMAGES_CONTAINER)
    img_elements = container.locator("img")
    count = img_elements.count()

    print(f"Found {count} image elements")

    # Process each image
    downloaded_count = 0
    for i in range(count):
        img = img_elements.nth(i)

        # Extract image URL (src attribute)
        src = img.get_attribute("src")
        if not src:
            print(f"Image {i+1}/{count}: No src attribute found")
            continue

        # Some sites use data-src or other attributes for lazy loading;
        # prefer those when src is only an inline data: placeholder.
        if src.startswith("data:"):
            src = img.get_attribute("data-src") or img.get_attribute("data-original") or src

        # Skip inline data URLs entirely — there is nothing to fetch over HTTP.
        # (Checking only "data:image/" would let other data: URLs through to a
        # guaranteed-failing request.)
        if not src or src.startswith("data:"):
            print(f"Image {i+1}/{count}: Invalid or data URL")
            continue

        # Make URL absolute: handle protocol-relative (//host/...) and
        # root-relative (/path) URLs against the page's scheme/host.
        if src.startswith("//"):
            src = f"{urllib.parse.urlparse(WEB_URL).scheme}:{src}"
        elif src.startswith("/"):
            parsed_url = urllib.parse.urlparse(WEB_URL)
            src = f"{parsed_url.scheme}://{parsed_url.netloc}{src}"

        # Check file extension
        file_ext = get_file_extension(src)
        if file_ext.lower() not in IMAGES_FORMATS:
            print(f"Image {i+1}/{count}: Format {file_ext} not in allowed formats")
            continue

        # Get image width (if available). Images without a numeric width
        # attribute are deliberately allowed through.
        width_str = img.get_attribute("width")
        if width_str and width_str.isdigit():
            width = int(width_str)
            if width < IMAGES_WIDTH_RANGE[0] or width > IMAGES_WIDTH_RANGE[1]:
                print(f"Image {i+1}/{count}: Width {width}px outside allowed range")
                continue

        # Generate filename
        if IMAGES_RENAME:
            filename = f"image_{downloaded_count + 1}{file_ext}"
        else:
            # Fall back to a generated name when the URL path has no basename
            # (e.g. "https://host/"), which would otherwise yield an empty name.
            filename = (
                os.path.basename(urllib.parse.urlparse(src).path)
                or f"image_{downloaded_count + 1}{file_ext}"
            )

        output_path = output_dir / filename

        # Download the image
        try:
            # Fetch the raw image bytes with Playwright's HTTP request API,
            # reusing the page's browser context (cookies, headers). The
            # previous approach — opening the URL in a page and calling
            # screenshot() — saved a re-encoded screenshot of the viewport,
            # not the original image file.
            response = page.request.get(src)
            if not response.ok:
                raise RuntimeError(f"HTTP {response.status} for {src}")
            output_path.write_bytes(response.body())

            downloaded_count += 1
            print(f"Image {i+1}/{count}: Downloaded to {output_path}")
        except Exception as e:
            # Best-effort: log and move on to the next image.
            print(f"Image {i+1}/{count}: Download failed: {e}")

    print(f"\nDownload complete: {downloaded_count}/{count} images downloaded to {output_dir}")


def get_file_extension(url: str) -> str:
    """Return the file extension of *url*, or ".jpg" when none is present."""
    # Strip the query string first, then parse to isolate the path component.
    without_query = url.split("?", 1)[0]
    path_part = urllib.parse.urlparse(without_query).path

    # splitext yields ("stem", ".ext"); an empty extension falls back to .jpg.
    ext = os.path.splitext(path_part)[1]
    return ext or ".jpg"