Extract texts

Extract text from a specific element

tasks.py

import os

from robocorp.tasks import task
from robocorp import browser

@task
def extract_text_from_element():
    """Extract text from a specific element on a webpage."""
    # Open a browser and navigate to the webpage
    page = browser.goto("https://en.wikipedia.org/wiki/Robot")

    # Extract text from a specific element using different methods

    # Method 1: Using text_content() - gets all text content
    heading = page.locator("h1#firstHeading")
    heading_text = heading.text_content()
    print(f"Page heading: {heading_text}")

    # Method 2: Using inner_text() - gets only visible text (excludes hidden elements)
    first_paragraph = page.locator("#mw-content-text > div.mw-parser-output > p:first-of-type")
    paragraph_text = first_paragraph.inner_text()
    print(f"First paragraph: {paragraph_text}")

    # Method 3: Using evaluate() with JavaScript - allows custom JS extraction
    page_title = page.evaluate("document.title")
    print(f"Page title (via JavaScript): {page_title}")

    # Extract text from a specific table cell
    table_cell = page.locator("table.infobox tr:nth-child(3) td")
    if table_cell.count() > 0:
        cell_text = table_cell.inner_text()
        print(f"Info table cell: {cell_text}")

    # Extract and process a list of elements
    list_items = page.locator("ul.vector-toc-contents li")
    item_count = list_items.count()
    print(f"Found {item_count} table of contents items")

    # Extract text from the first 5 list items (or all if fewer than 5)
    for i in range(min(5, item_count)):
        item_text = list_items.nth(i).inner_text()
        print(f"  TOC Item {i+1}: {item_text}")

    # Save the extracted text to a file (create the output directory first)
    os.makedirs("output", exist_ok=True)
    with open("output/extracted_text.txt", "w", encoding="utf-8") as f:
        f.write(f"Heading: {heading_text}\n\n")
        f.write(f"First Paragraph: {paragraph_text}\n\n")
        f.write(f"Page Title: {page_title}\n")

Extract structured data

tasks.py

import csv
import json
import os

from robocorp.tasks import task
from robocorp import browser

@task
def extract_structured_data():
    """Extract structured data (like tables) from a webpage and save to CSV or JSON."""
    # Open a browser and navigate to the webpage with tabular data
    page = browser.goto("https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)")

    # Wait for the table to be visible
    page.wait_for_selector("table.wikitable")

    # Extract data from the table
    table = page.locator("table.wikitable")
    rows = table.locator("tbody > tr")

    # Initialize list to store the data
    countries_data = []

    # Get the number of rows
    row_count = rows.count()
    print(f"Found {row_count} rows in the table")

    # Extract the header row to get column names (MediaWiki tables keep
    # their header cells in the first row rather than in a <thead>)
    header_cells = table.locator("tr").first.locator("th")
    headers = []
    for i in range(header_cells.count()):
        header_text = header_cells.nth(i).inner_text().strip()
        headers.append(header_text)

    # Print the headers
    print(f"Table headers: {headers}")

    # Process each row (skip the header row)
    for i in range(1, min(15, row_count)):  # Limit to first 15 rows for demonstration
        # Get all cells in the row
        row = rows.nth(i)
        cells = row.locator("td")

        # Skip rows that don't have enough cells
        if cells.count() < 3:
            continue

        # Extract data from cells
        try:
            rank = cells.nth(0).inner_text().strip()
            country = cells.nth(1).inner_text().strip()
            population = cells.nth(2).inner_text().strip().replace(",", "")

            # Create a data entry
            country_data = {
                "Rank": rank,
                "Country": country,
                "Population": population
            }

            # Add to our data list
            countries_data.append(country_data)

            print(f"Extracted data for {country}: Rank {rank}, Population {population}")

        except Exception as e:
            print(f"Error extracting data from row {i}: {e}")

    # Make sure the output directory exists
    os.makedirs("output", exist_ok=True)

    # Save data as CSV
    with open("output/countries.csv", "w", newline="", encoding="utf-8") as csvfile:
        # Create a CSV writer
        writer = csv.DictWriter(csvfile, fieldnames=["Rank", "Country", "Population"])

        # Write the header row
        writer.writeheader()

        # Write all data rows
        writer.writerows(countries_data)

    # Save data as JSON
    with open("output/countries.json", "w", encoding="utf-8") as jsonfile:
        json.dump(countries_data, jsonfile, indent=2)

    print(f"Extracted data for {len(countries_data)} countries and saved to CSV and JSON formats")

Extract text using regular expressions

tasks.py

import re
import os
from robocorp.tasks import task
from robocorp import browser

@task
def extract_text_with_regex():
    """Extract specific text patterns from a webpage using regular expressions."""
    # Open a browser and navigate to the webpage
    page = browser.goto("https://en.wikipedia.org/wiki/Email_address")

    # Get the entire page content
    page_content = page.content()

    # Define regular expressions for different patterns

    # 1. Extract email addresses
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, page_content)

    # Remove duplicates by converting to a set and back to a list
    unique_emails = list(set(emails))

    print(f"Found {len(unique_emails)} unique email addresses")
    for email in unique_emails[:5]:  # Print just the first 5
        print(f"  - {email}")

    # 2. Extract dates in format YYYY-MM-DD
    date_pattern = r'\b\d{4}-\d{2}-\d{2}\b'
    dates = re.findall(date_pattern, page_content)

    print(f"Found {len(dates)} dates in YYYY-MM-DD format")
    for date in dates[:5]:
        print(f"  - {date}")

    # 3. Extract URLs
    url_pattern = r'https?://[^\s<>"\']*[^\s<>"\',.]'
    urls = re.findall(url_pattern, page_content)

    print(f"Found {len(urls)} URLs")
    for url in urls[:5]:
        print(f"  - {url}")

    # 4. Extract section headings (text between <h2> tags)
    # Note: Regular expressions aren't ideal for parsing HTML, but this is a simple example
    heading_pattern = r'<h2[^>]*>.*?<span[^>]*>(.*?)</span>'
    headings = re.findall(heading_pattern, page_content)

    print(f"Found {len(headings)} section headings")
    for heading in headings:
        print(f"  - {heading}")

    # 5. Extract text in parentheses
    parentheses_pattern = r'\([^()]*\)'
    parentheses_text = re.findall(parentheses_pattern, page_content)

    print(f"Found {len(parentheses_text)} text segments in parentheses")
    for text in parentheses_text[:5]:
        print(f"  - {text}")

    # Save all extracted data to a file
    os.makedirs("output", exist_ok=True)
    with open("output/regex_extractions.txt", "w", encoding="utf-8") as f:
        f.write("===== EMAIL ADDRESSES =====\n")
        for email in unique_emails:
            f.write(f"{email}\n")

        f.write("\n===== DATES (YYYY-MM-DD) =====\n")
        for date in dates:
            f.write(f"{date}\n")

        f.write("\n===== URLs =====\n")
        for url in urls:
            f.write(f"{url}\n")

        f.write("\n===== SECTION HEADINGS =====\n")
        for heading in headings:
            f.write(f"{heading}\n")

    print("All extracted data saved to output/regex_extractions.txt")
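
As the comment in the task above notes, regular expressions are a fragile way to parse HTML. When the data is still available in the live page, a locator-based query is usually more robust than matching the raw markup. A minimal sketch of the heading extraction done that way (the level=2 role query is an assumption about the page structure):

from robocorp.tasks import task
from robocorp import browser

@task
def extract_headings_with_locators():
    """Extract section headings from the DOM instead of the raw HTML."""
    page = browser.goto("https://en.wikipedia.org/wiki/Email_address")

    # get_by_role matches accessible headings; level=2 limits it to <h2> elements
    headings = page.get_by_role("heading", level=2).all_inner_texts()
    for heading in headings:
        print(f"  - {heading}")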