Extract texts
Extract text from a specific element
tasks.py
from robocorp.tasks import task
from robocorp import browser
import os


@task
def extract_text_from_element():
    """Extract text from a specific element on a webpage."""
    # Open a browser and navigate to the webpage
    page = browser.goto("https://en.wikipedia.org/wiki/Robot")

    # Extract text from a specific element using different methods

    # Method 1: text_content() - gets all text content, including hidden elements
    heading = page.locator("h1#firstHeading")
    heading_text = heading.text_content()
    print(f"Page heading: {heading_text}")

    # Method 2: inner_text() - gets only the rendered, visible text
    first_paragraph = page.locator("#mw-content-text > div.mw-parser-output > p:first-of-type")
    paragraph_text = first_paragraph.inner_text()
    print(f"First paragraph: {paragraph_text}")

    # Method 3: evaluate() - runs custom JavaScript in the page context
    page_title = page.evaluate("document.title")
    print(f"Page title (via JavaScript): {page_title}")

    # Extract text from a specific table cell; use .first in case the
    # selector matches more than one element
    table_cell = page.locator("table.infobox tr:nth-child(3) td")
    if table_cell.count() > 0:
        cell_text = table_cell.first.inner_text()
        print(f"Info table cell: {cell_text}")

    # Extract and process a list of elements
    list_items = page.locator("ul.vector-toc-contents li")
    item_count = list_items.count()
    print(f"Found {item_count} table of contents items")

    # Extract text from the first 5 list items (or all if fewer than 5)
    for i in range(min(5, item_count)):
        item_text = list_items.nth(i).inner_text()
        print(f"  TOC Item {i+1}: {item_text}")

    # Save the extracted text to a file
    os.makedirs("output", exist_ok=True)
    with open("output/extracted_text.txt", "w", encoding="utf-8") as f:
        f.write(f"Heading: {heading_text}\n\n")
        f.write(f"First Paragraph: {paragraph_text}\n\n")
        f.write(f"Page Title: {page_title}\n")
Extract structured data
tasks.py
import csv
import json
import os

from robocorp.tasks import task
from robocorp import browser


@task
def extract_structured_data():
    """Extract structured data (like tables) from a webpage and save it to CSV and JSON."""
    # Open a browser and navigate to the webpage with tabular data
    page = browser.goto("https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)")

    # Wait for the table to be visible
    page.wait_for_selector("table.wikitable")

    # Extract data from the first matching table
    table = page.locator("table.wikitable").first
    rows = table.locator("tbody > tr")

    # Initialize a list to store the data
    countries_data = []

    # Get the number of rows
    row_count = rows.count()
    print(f"Found {row_count} rows in the table")

    # Extract the header row to get the column names
    # (Wikipedia tables keep their header cells in the first body row)
    header_cells = rows.nth(0).locator("th")
    headers = []
    for i in range(header_cells.count()):
        header_text = header_cells.nth(i).inner_text().strip()
        headers.append(header_text)
    print(f"Table headers: {headers}")

    # Process each row, skipping the header row; limit to the first 15 rows
    # for demonstration
    for i in range(1, min(15, row_count)):
        # Get all cells in the row
        row = rows.nth(i)
        cells = row.locator("td")

        # Skip rows that don't have enough cells
        if cells.count() < 3:
            continue

        # Extract data from the cells
        try:
            rank = cells.nth(0).inner_text().strip()
            country = cells.nth(1).inner_text().strip()
            population = cells.nth(2).inner_text().strip().replace(",", "")

            # Create a data entry and add it to the list
            country_data = {
                "Rank": rank,
                "Country": country,
                "Population": population,
            }
            countries_data.append(country_data)
            print(f"Extracted data for {country}: Rank {rank}, Population {population}")
        except Exception as e:
            print(f"Error extracting data from row {i}: {e}")

    # Make sure the output directory exists
    os.makedirs("output", exist_ok=True)

    # Save the data as CSV
    with open("output/countries.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["Rank", "Country", "Population"])
        writer.writeheader()
        writer.writerows(countries_data)

    # Save the data as JSON
    with open("output/countries.json", "w", encoding="utf-8") as jsonfile:
        json.dump(countries_data, jsonfile, indent=2)

    print(f"Extracted data for {len(countries_data)} countries and saved to CSV and JSON formats")
Extract text using regular expressions
tasks.py
import os
import re

from robocorp.tasks import task
from robocorp import browser


@task
def extract_text_with_regex():
    """Extract specific text patterns from a webpage using regular expressions."""
    # Open a browser and navigate to the webpage
    page = browser.goto("https://en.wikipedia.org/wiki/Email_address")

    # Get the entire page content (raw HTML)
    page_content = page.content()

    # Define regular expressions for different patterns

    # 1. Extract email addresses
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, page_content)

    # Remove duplicates by converting to a set and back to a list
    unique_emails = list(set(emails))
    print(f"Found {len(unique_emails)} unique email addresses")
    for email in unique_emails[:5]:  # Print just the first 5
        print(f"  - {email}")

    # 2. Extract dates in YYYY-MM-DD format
    date_pattern = r'\b\d{4}-\d{2}-\d{2}\b'
    dates = re.findall(date_pattern, page_content)
    print(f"Found {len(dates)} dates in YYYY-MM-DD format")
    for date in dates[:5]:
        print(f"  - {date}")

    # 3. Extract URLs
    url_pattern = r'https?://[^\s<>"\']*[^\s<>"\',.]'
    urls = re.findall(url_pattern, page_content)
    print(f"Found {len(urls)} URLs")
    for url in urls[:5]:
        print(f"  - {url}")

    # 4. Extract section headings (the text inside the heading tags)
    # Note: Regular expressions aren't ideal for parsing HTML, but this is a simple example
    heading_pattern = r'<h2[^>]*>.*?<span[^>]*>(.*?)</span>'
    headings = re.findall(heading_pattern, page_content)
    print(f"Found {len(headings)} section headings")
    for heading in headings:
        print(f"  - {heading}")

    # 5. Extract text in parentheses
    parentheses_pattern = r'\([^()]*\)'
    parentheses_text = re.findall(parentheses_pattern, page_content)
    print(f"Found {len(parentheses_text)} text segments in parentheses")
    for text in parentheses_text[:5]:
        print(f"  - {text}")

    # Save all the extracted data to a file
    os.makedirs("output", exist_ok=True)
    with open("output/regex_extractions.txt", "w", encoding="utf-8") as f:
        f.write("===== EMAIL ADDRESSES =====\n")
        for email in unique_emails:
            f.write(f"{email}\n")
        f.write("\n===== DATES (YYYY-MM-DD) =====\n")
        for date in dates:
            f.write(f"{date}\n")
        f.write("\n===== URLs =====\n")
        for url in urls:
            f.write(f"{url}\n")
        f.write("\n===== SECTION HEADINGS =====\n")
        for heading in headings:
            f.write(f"{heading}\n")

    print("All extracted data saved to output/regex_extractions.txt")