How to Build a Job Board Scraper in Under an Hour Using ScrapeUp's API
Extract job listings from Indeed, LinkedIn, and Glassdoor with a single API call. Step-by-step Python tutorial — no proxy management, no CAPTCHA headaches.
Job listings are one of the richest real-time data sources on the internet. They reveal which companies are hiring, what skills are in demand, and where salaries are moving — before that information shows up in any analyst report. Whether you're building a job aggregator, tracking hiring trends, or monitoring competitor growth signals, programmatic access to job data is a genuine edge.
The problem? Job boards are notoriously hard to scrape. Indeed, LinkedIn, and Glassdoor deploy aggressive bot detection — rate limiting, IP blocks, JavaScript rendering requirements, and CAPTCHAs. Most DIY scrapers break within days.
That's exactly the problem ScrapeUp solves. One API call. Rendered HTML. Real residential IPs. CAPTCHAs bypassed. You just get the data.
How ScrapeUp's API Works
ScrapeUp acts as a managed scraping layer between your code and the target website. You pass a URL, ScrapeUp handles everything — browser rendering, proxy rotation, CAPTCHA solving, and retry logic — and returns the clean HTML of the rendered page.
The base endpoint:
GET https://api.scrapeup.com?api_key=YOUR_KEY&url=TARGET_URL
Key parameters:
| Parameter | Required | Description |
|---|---|---|
| api_key | Required | Your ScrapeUp API key. |
| url | Required | The fully-encoded target URL to scrape. |
| render | Optional | Set to `true` to execute JavaScript before returning HTML. Essential for React/SPA job boards. |
| country_code | Optional | Route through a residential IP in a specific country (e.g. `us`, `gb`). |
| premium_proxy | Optional | Set to `true` for residential proxy routing — highest success rate on protected sites. |
Step 1 — Install Dependencies and Configure Your Client
Grab your free API key at scrapeup.com — free accounts include 1,000 credits/month. Then install the libraries:
pip install requests beautifulsoup4 pandas

Now create a reusable scrape helper you'll use throughout this tutorial:
import requests

# Your ScrapeUp API key — replace with the key from your dashboard.
API_KEY = "YOUR_SCRAPEUP_API_KEY"
# Single endpoint; the target URL is passed as a query parameter.
BASE_URL = "https://api.scrapeup.com"


def scrape(url, render=False, premium=False, country=None):
    """
    Fetch a URL through ScrapeUp's API.

    Args:
        url: Target URL to scrape (forwarded to ScrapeUp unchanged).
        render: When True, ask ScrapeUp to execute JavaScript before
            returning the HTML — needed for SPA-style job boards.
        premium: When True, route through residential proxies.
        country: Optional country code for geo-targeted IPs (e.g. "us").

    Returns the rendered HTML content as a string.
    """
    params = {
        "api_key": API_KEY,
        "url": url,
        # ScrapeUp expects string flags, not Python booleans.
        "render": "true" if render else "false",
    }
    if premium:
        params["premium_proxy"] = "true"
    if country:
        params["country_code"] = country
    response = requests.get(BASE_URL, params=params, timeout=30)
    # Surface HTTP-level failures (bad key, exhausted quota) immediately.
    response.raise_for_status()
    return response.text

Step 2 — Scrape Indeed Job Listings
Indeed is the highest-volume job board in the US. Their search URL is clean and predictable. We'll scrape software engineering jobs in Austin, TX as a real-world example. Indeed's search URL pattern is straightforward:
https://www.indeed.com/jobs?q=software+engineer&l=Austin%2C+TX&sort=date
from bs4 import BeautifulSoup
import json
from config import scrape
def _text(el, default="N/A"):
    """Return an element's stripped text, or *default* if the element is None."""
    return el.get_text(strip=True) if el else default


def scrape_indeed_jobs(query, location, pages=3):
    """
    Scrape job listings from Indeed through the ScrapeUp API.

    Args:
        query: Job title or keywords (e.g. "software engineer")
        location: City and state (e.g. "Austin, TX")
        pages: Number of result pages to scrape (10 jobs each)

    Returns:
        A list of dicts with id, title, company, location, salary,
        snippet, url, and source keys — one per job card found.
    """
    from urllib.parse import urlencode

    all_jobs = []
    # Indeed paginates with a 0-based `start` offset that moves in steps of 10.
    for start in range(0, pages * 10, 10):
        # urlencode percent-escapes every special character (&, #, accents, ...)
        # correctly, unlike the fragile hand-rolled str.replace() approach.
        qs = urlencode({"q": query, "l": location, "start": start, "sort": "date"})
        url = f"https://www.indeed.com/jobs?{qs}"
        print(f"Scraping page {start // 10 + 1}: {url}")
        # Indeed requires JS rendering + premium proxy for reliability
        html = scrape(url, render=True, premium=True, country="us")
        soup = BeautifulSoup(html, "html.parser")
        # Indeed job cards use the data-jk attribute as a unique job ID
        job_cards = soup.find_all("div", attrs={"data-jk": True})
        for card in job_cards:
            job_id = card.get("data-jk")
            salary_el = card.find("div", class_="metadata salary-snippet-container")
            all_jobs.append({
                "id": job_id,
                "title": _text(card.find("h2", class_="jobTitle")),
                "company": _text(card.find("span", attrs={"data-testid": "company-name"})),
                "location": _text(card.find("div", attrs={"data-testid": "text-location"})),
                "salary": _text(salary_el, default="Not listed"),
                "snippet": _text(card.find("div", class_="job-snippet"), default=""),
                "url": f"https://www.indeed.com/viewjob?jk={job_id}",
                "source": "indeed"
            })
        print(f" → Found {len(job_cards)} jobs on this page")
    return all_jobs
if __name__ == "__main__":
    # Demo run: three pages of Austin software-engineering listings.
    jobs = scrape_indeed_jobs("software engineer", "Austin, TX", pages=3)
    print(f"\n✅ Scraped {len(jobs)} total jobs")
    # Persist the raw results for the combining step later in the tutorial.
    with open("indeed_jobs.json", "w") as f:
        f.write(json.dumps(jobs, indent=2))
Tip: always call the helper with render=True, premium=True when targeting Indeed.

Step 3 — Scrape LinkedIn Jobs
LinkedIn job data is valuable because it includes company size, seniority level, and date posted — fields Indeed doesn't expose. LinkedIn's public job search works without a login, but it's heavily JavaScript-rendered and fingerprinted. ScrapeUp handles this with premium residential proxies.
from bs4 import BeautifulSoup
import json, re
from config import scrape
def scrape_linkedin_jobs(keywords, location, pages=2):
    """
    Scrape LinkedIn public job listings through the ScrapeUp API.

    Uses 25-result pagination (LinkedIn's default page size).

    Args:
        keywords: Search keywords (e.g. "data engineer").
        location: Location string (e.g. "New York").
        pages: Number of 25-result pages to fetch.

    Returns:
        A list of dicts with title, company, location, date_posted,
        url, and source keys.
    """
    from urllib.parse import quote, urlencode

    all_jobs = []
    for offset in range(0, pages * 25, 25):
        # quote_via=quote keeps spaces as %20 (the form LinkedIn serves)
        # while still escaping every other special character safely.
        qs = urlencode(
            {"keywords": keywords, "location": location, "start": offset, "sortBy": "DD"},
            quote_via=quote,
        )
        url = f"https://www.linkedin.com/jobs/search/?{qs}"
        print(f"Scraping LinkedIn page {offset // 25 + 1}...")
        html = scrape(url, render=True, premium=True)
        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all("li", class_=re.compile(r"jobs-search-results__list-item"))
        if not cards:
            # Fallback: logged-out markup variant uses base-card containers.
            cards = soup.select("div.base-card")
        for card in cards:
            title_el = card.find("h3", class_=re.compile(r"base-search-card__title"))
            title = title_el.get_text(strip=True) if title_el else "N/A"
            company_el = card.find("h4", class_=re.compile(r"base-search-card__subtitle"))
            company = company_el.get_text(strip=True) if company_el else "N/A"
            loc_el = card.find("span", class_=re.compile(r"job-search-card__location"))
            location_text = loc_el.get_text(strip=True) if loc_el else "N/A"
            # .get() avoids a KeyError when a <time> tag lacks the attribute.
            date_el = card.find("time")
            date_posted = date_el.get("datetime", "N/A") if date_el else "N/A"
            link_el = card.find("a", class_=re.compile(r"base-card__full-link"))
            href = link_el.get("href") if link_el else None
            # Strip tracking query parameters from the job URL.
            job_url = href.split("?")[0] if href else "N/A"
            all_jobs.append({
                "title": title,
                "company": company,
                "location": location_text,
                "date_posted": date_posted,
                "url": job_url,
                "source": "linkedin"
            })
        print(f" → Captured {len(cards)} listings")
    return all_jobs
if __name__ == "__main__":
    # Demo run: two pages of New York data-engineering listings.
    collected = scrape_linkedin_jobs("data engineer", "New York", pages=2)
    print(f"\n✅ {len(collected)} LinkedIn jobs collected")
    # Persist the raw results for the combining step later in the tutorial.
    with open("linkedin_jobs.json", "w") as f:
        f.write(json.dumps(collected, indent=2))
Tip: add a time.sleep(2) between pages in production to stay within polite scraping limits. ScrapeUp rotates IPs, but spacing requests protects against behavioral detection.

Step 4 — Scrape Glassdoor Salary and Review Data
Glassdoor's real value is the salary estimates and employer ratings attached to every listing. This is the data you want for compensation benchmarking or employer research tools. Glassdoor is the most aggressively protected of the three — always use premium proxy mode.
from bs4 import BeautifulSoup
import json
from config import scrape
def scrape_glassdoor_jobs(query, location):
    """
    Scrape Glassdoor job listings — including salary estimates and
    employer ratings — through the ScrapeUp API.

    Args:
        query: Job title or keywords (e.g. "product manager").
        location: Location keyword (e.g. "San Francisco").

    Returns:
        A list of dicts with title, employer, location, salary_estimate,
        employer_rating, url, and source keys.
    """
    from urllib.parse import urlencode

    # urlencode escapes query/location safely instead of naive replaces.
    qs = urlencode({
        "sc.keyword": query,
        "locT": "C",
        "locKeyword": location,
        "includeNoSalaryJobs": "true",
        "radius": "25",
        "minRating": "0.0",
    })
    url = f"https://www.glassdoor.com/Job/jobs.htm?{qs}"
    print(f"Scraping Glassdoor: {query} in {location}")
    # Glassdoor is the most aggressively protected of the three boards —
    # premium residential proxies are required for a usable success rate.
    html = scrape(url, render=True, premium=True)
    soup = BeautifulSoup(html, "html.parser")
    jobs = []
    cards = soup.find_all("li", attrs={"data-test": "jobListing"})
    for card in cards:
        title_el = card.find("a", attrs={"data-test": "job-title"})
        title = title_el.get_text(strip=True) if title_el else "N/A"
        # .get() guards against anchors that exist but carry no href,
        # which would KeyError with title_el["href"].
        href = title_el.get("href") if title_el else None
        job_url = "https://www.glassdoor.com" + href if href else "N/A"
        employer_el = card.find("div", attrs={"data-test": "employer-name"})
        employer = employer_el.get_text(strip=True) if employer_el else "N/A"
        loc_el = card.find("div", attrs={"data-test": "emp-location"})
        location_text = loc_el.get_text(strip=True) if loc_el else "N/A"
        salary_el = card.find("div", attrs={"data-test": "detailSalary"})
        salary_estimate = salary_el.get_text(strip=True) if salary_el else "Not disclosed"
        rating_el = card.find("span", attrs={"data-test": "rating"})
        rating = rating_el.get_text(strip=True) if rating_el else "N/A"
        jobs.append({
            "title": title,
            "employer": employer,
            "location": location_text,
            "salary_estimate": salary_estimate,
            "employer_rating": rating,
            "url": job_url,
            "source": "glassdoor"
        })
    print(f" → Found {len(jobs)} listings")
    return jobs
if __name__ == "__main__":
    jobs = scrape_glassdoor_jobs("product manager", "San Francisco")
    # Persist the raw results for the combining step in Step 5.
    with open("glassdoor_jobs.json", "w") as f:
        json.dump(jobs, f, indent=2)

Step 5 — Combine and Normalize All Three Sources
Now let's merge all three scrapers into a single pipeline that deduplicates by URL and exports a clean CSV and JSON — ready for a database, dashboard, or downstream product.
import pandas as pd
import json, hashlib
from indeed_scraper import scrape_indeed_jobs
from linkedin_scraper import scrape_linkedin_jobs
from glassdoor_scraper import scrape_glassdoor_jobs
def normalize_job(job):
    """
    Map a raw record from any of the three scrapers onto one shared schema.

    Falls back across source-specific field names (company/employer,
    salary/salary_estimate) and derives a stable short job_id by hashing
    the listing URL.
    """
    raw_url = job.get("url", "")
    company = job.get("company") or job.get("employer", "N/A")
    salary = job.get("salary") or job.get("salary_estimate", "Not listed")
    normalized = {
        "job_id": hashlib.md5(raw_url.encode()).hexdigest()[:10],
        "title": job.get("title", "N/A"),
        "company": company,
        "location": job.get("location", "N/A"),
        "salary": salary,
        "rating": job.get("employer_rating", "N/A"),
        "url": job.get("url", "N/A"),
        "source": job.get("source", "unknown"),
    }
    return normalized
def run_pipeline(query, location):
    """
    Run all three scrapers for one query/location pair, normalize the
    results into a shared schema, deduplicate by URL, and export both a
    CSV and a JSON file.

    Args:
        query: Job title or keywords.
        location: Location string.

    Returns:
        pandas.DataFrame of unique, normalized job records.
    """
    print(f"\n🚀 Starting job scrape pipeline — {query} | {location}")
    indeed_jobs = scrape_indeed_jobs(query, location, pages=2)
    linkedin_jobs = scrape_linkedin_jobs(query, location, pages=2)
    glassdoor_jobs = scrape_glassdoor_jobs(query, location)
    all_raw = indeed_jobs + linkedin_jobs + glassdoor_jobs
    print(f"\n📊 Raw: {len(indeed_jobs)} Indeed | {len(linkedin_jobs)} LinkedIn | {len(glassdoor_jobs)} Glassdoor")
    normalized = [normalize_job(j) for j in all_raw]
    df = pd.DataFrame(normalized)
    if not df.empty:
        # Dedupe by URL, but never collapse records whose URL is the
        # "N/A" placeholder — those are distinct jobs that merely lack
        # a link, and a plain drop_duplicates would merge them all.
        dupes = df.duplicated(subset="url", keep="first") & (df["url"] != "N/A")
        df = df[~dupes].reset_index(drop=True)
    df.to_csv("jobs_combined.csv", index=False)
    df.to_json("jobs_combined.json", orient="records", indent=2)
    print(f"\n✅ {len(df)} unique jobs saved to jobs_combined.csv + jobs_combined.json")
    return df
if __name__ == "__main__":
    df = run_pipeline("machine learning engineer", "Remote")

Step 6 — Schedule Daily Scrapes
A one-time scrape is useful. A daily scrape that keeps your dataset fresh is a product. Here's how to automate the full pipeline:
import schedule, time, os
from datetime import datetime
from pipeline import run_pipeline
import pandas as pd
# Every search we want refreshed each day.
SEARCHES = [
    {"query": "machine learning engineer", "location": "Remote"},
    {"query": "data scientist", "location": "New York"},
    {"query": "product manager", "location": "San Francisco"},
]


def daily_scrape():
    """Run every configured search once and archive the combined results."""
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
    print(f"\n⏰ Scheduled scrape at {stamp}")
    # One DataFrame per configured search.
    frames = []
    for search in SEARCHES:
        frames.append(run_pipeline(search["query"], search["location"]))
    os.makedirs("archive", exist_ok=True)
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(f"archive/jobs_{stamp}.csv", index=False)
    print(f"✅ Archived {len(combined)} jobs → archive/jobs_{stamp}.csv")


daily_scrape()  # Run immediately on start
schedule.every().day.at("07:00").do(daily_scrape)
print("\n📅 Scheduler running. Press Ctrl+C to stop.")
# Poll once a minute; `schedule` fires the job at the configured time.
while True:
    schedule.run_pending()
    time.sleep(60)