A comprehensive web scraping solution using the JSoup library, with advanced features such as concurrent scraping, data extraction, and multi-format export capabilities.
Dependencies
<dependencies> <!-- JSoup HTML Parser --> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.17.1</version> </dependency> <!-- HTTP Client --> <dependency> <groupId>org.apache.httpcomponents.client5</groupId> <artifactId>httpclient5</artifactId> <version>5.2.1</version> </dependency> <!-- JSON Processing --> <dependency> <groupId>com.fasterxml.jackson.core</groupId> <artifactId>jackson-databind</artifactId> <version>2.15.2</version> </dependency> <!-- CSV Writing --> <dependency> <groupId>com.opencsv</groupId> <artifactId>opencsv</artifactId> <version>5.8</version> </dependency> <!-- Logging --> <dependency> <groupId>ch.qos.logback</groupId> <artifactId>logback-classic</artifactId> <version>1.4.11</version> </dependency> <!-- Concurrency Utilities --> <dependency> <groupId>io.github.resilience4j</groupId> <artifactId>resilience4j-retry</artifactId> <version>2.1.0</version> </dependency> </dependencies>
Core Scraping Engine
Example 1: Basic JSoup Scraper
package com.webscraper.core;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Single-threaded JSoup-based scraper offering link, text, and table
 * extraction plus a convenience scrape-and-save-to-JSON operation.
 * Connection behavior (user agent, timeout, headers) comes from
 * {@link ScraperConfig}.
 */
public class BasicScraper {

    private static final Logger logger = LoggerFactory.getLogger(BasicScraper.class);

    private final ScraperConfig config;

    /** Creates a scraper backed by a default {@link ScraperConfig}. */
    public BasicScraper() {
        this(new ScraperConfig());
    }

    /**
     * Creates a scraper backed by the given configuration.
     *
     * @param config connection settings (user agent, timeout, referrer, headers)
     */
    public BasicScraper(ScraperConfig config) {
        this.config = config;
    }

    /**
     * Fetches and parses the HTML document at {@code url} using the configured
     * connection settings.
     *
     * @throws IOException if the connection fails or the server returns an error
     */
    public Document fetchDocument(String url) throws IOException {
        logger.info("Fetching document from: {}", url);
        return Jsoup.connect(url)
                .userAgent(config.getUserAgent())
                .timeout(config.getTimeout())
                .referrer(config.getReferrer())
                .headers(config.getDefaultHeaders())
                .get();
    }

    /**
     * Extracts absolute link URLs from the elements matching {@code cssSelector}.
     * Links whose resolved URL is empty (e.g. fragment-only anchors) are skipped.
     */
    public List<String> extractLinks(String url, String cssSelector) throws IOException {
        Document doc = fetchDocument(url);
        Elements linkElements = doc.select(cssSelector);
        List<String> links = new ArrayList<>();
        for (Element link : linkElements) {
            // "abs:href" resolves relative URLs against the document's base URL.
            String href = link.attr("abs:href");
            if (!href.isEmpty()) {
                links.add(href);
            }
        }
        logger.info("Extracted {} links from {}", links.size(), url);
        return links;
    }

    /**
     * Extracts the combined text of the elements matching each selector.
     *
     * @param selectors map of output key -> CSS selector
     * @return map of output key -> extracted text ("" when nothing matched)
     */
    public Map<String, String> extractTextContent(String url, Map<String, String> selectors)
            throws IOException {
        Document doc = fetchDocument(url);
        Map<String, String> content = new HashMap<>();
        for (Map.Entry<String, String> entry : selectors.entrySet()) {
            String key = entry.getKey();
            String selector = entry.getValue();
            Elements elements = doc.select(selector);
            if (!elements.isEmpty()) {
                content.put(key, elements.text());
            } else {
                content.put(key, "");
                logger.warn("No elements found for selector: {} on {}", selector, url);
            }
        }
        return content;
    }

    /**
     * Extracts an HTML table into a list of row maps keyed by the first row's
     * header cells (or generated "Column_N" names when no headers exist).
     * Data rows whose cell count differs from the header count are skipped.
     *
     * @return one map per data row; empty list when the table is missing or empty
     */
    public List<Map<String, String>> extractTableData(String url, String tableSelector)
            throws IOException {
        Document doc = fetchDocument(url);
        Element table = doc.selectFirst(tableSelector);
        if (table == null) {
            logger.warn("No table found with selector: {} on {}", tableSelector, url);
            return new ArrayList<>();
        }
        List<Map<String, String>> tableData = new ArrayList<>();
        Elements rows = table.select("tr");
        // BUGFIX: rows.first() was previously dereferenced unconditionally and
        // threw a NullPointerException on tables without any <tr> rows.
        if (rows.isEmpty()) {
            logger.warn("Table matched by {} on {} has no rows", tableSelector, url);
            return tableData;
        }
        // Extract headers from the first row if available.
        Elements headerCells = rows.first().select("th, td");
        List<String> headers = new ArrayList<>();
        for (Element header : headerCells) {
            headers.add(header.text().trim());
        }
        // If no headers were found, generate generic column names.
        if (headers.isEmpty()) {
            int colCount = rows.first().select("td").size();
            for (int i = 0; i < colCount; i++) {
                headers.add("Column_" + (i + 1));
            }
        }
        // Extract data rows (the first row was consumed as the header row).
        for (int i = 1; i < rows.size(); i++) {
            Element row = rows.get(i);
            Elements cells = row.select("td");
            if (cells.size() == headers.size()) {
                Map<String, String> rowData = new HashMap<>();
                for (int j = 0; j < cells.size(); j++) {
                    rowData.put(headers.get(j), cells.get(j).text().trim());
                }
                tableData.add(rowData);
            }
        }
        logger.info("Extracted {} rows from table on {}", tableData.size(), url);
        return tableData;
    }

    /**
     * Scrapes basic page statistics (title, body text, link/image counts) from
     * {@code url} and writes them as JSON to {@code outputPath}.
     */
    public void scrapeAndSave(String url, String outputPath) throws IOException {
        Document doc = fetchDocument(url);
        // Extract various content types.
        String title = doc.title();
        String bodyText = doc.body().text();
        Elements images = doc.select("img[src]");
        Elements links = doc.select("a[href]");
        // Build the scraped data record.
        ScrapedData data = new ScrapedData();
        data.setUrl(url);
        data.setTitle(title);
        data.setContent(bodyText);
        data.setImageCount(images.size());
        data.setLinkCount(links.size());
        data.setTimestamp(System.currentTimeMillis());
        // BUGFIX: DataExporter.exportToJson takes a List, so the single record
        // must be wrapped; passing the bare object did not compile.
        DataExporter exporter = new DataExporter();
        exporter.exportToJson(List.of(data), outputPath);
        logger.info("Scraped data saved to: {}", outputPath);
    }
}
Example 2: Advanced Concurrent Scraper
package com.webscraper.core;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Scrapes many URLs in parallel on a fixed-size thread pool. Per-URL network
 * failures are captured in the returned {@link ScrapedData} (success flag +
 * error message) rather than thrown. Call {@link #shutdown()} when done.
 */
public class ConcurrentScraper {

    private static final Logger logger = LoggerFactory.getLogger(ConcurrentScraper.class);

    private final ScraperConfig config;
    // Pool size bounds the number of simultaneous HTTP fetches.
    private final ExecutorService executor;
    private final AtomicInteger successCount;
    private final AtomicInteger failureCount;

    public ConcurrentScraper(ScraperConfig config) {
        this.config = config;
        this.executor = Executors.newFixedThreadPool(config.getMaxConcurrentRequests());
        this.successCount = new AtomicInteger(0);
        this.failureCount = new AtomicInteger(0);
    }

    /**
     * Scrapes all URLs concurrently and blocks until every task completes,
     * fails, or times out.
     *
     * @return one ScrapedData per completed task, including per-URL failures
     *         (which carry {@code success == false} and an error message)
     */
    public List<ScrapedData> scrapeUrls(List<String> urls) {
        List<ScrapedData> results = new ArrayList<>();
        List<Future<ScrapedData>> futures = new ArrayList<>();
        logger.info("Starting concurrent scrape of {} URLs", urls.size());
        // Submit all scraping tasks up front so they run in parallel.
        for (String url : urls) {
            futures.add(executor.submit(() -> scrapeSingleUrl(url)));
        }
        // Collect results; allow a grace period beyond the connect timeout.
        for (Future<ScrapedData> future : futures) {
            try {
                ScrapedData result = future.get(config.getTimeout() + 5000L, TimeUnit.MILLISECONDS);
                if (result != null) {
                    results.add(result);
                    // BUGFIX: previously every completed future bumped
                    // successCount, even when the scrape itself had failed.
                    if (result.isSuccess()) {
                        successCount.incrementAndGet();
                    } else {
                        failureCount.incrementAndGet();
                    }
                }
            } catch (InterruptedException e) {
                failureCount.incrementAndGet();
                // Restore the interrupt flag so callers can observe it.
                Thread.currentThread().interrupt();
                logger.warn("Interrupted while collecting scraping results");
            } catch (Exception e) {
                failureCount.incrementAndGet();
                logger.warn("Failed to get scraping result: {}", e.getMessage());
            }
        }
        logger.info("Scraping completed: {} successes, {} failures",
                successCount.get(), failureCount.get());
        return results;
    }

    /**
     * Scrapes a single URL synchronously. Never throws: network failures are
     * reported through the returned object's success flag and error message.
     */
    public ScrapedData scrapeSingleUrl(String url) {
        ScrapedData data = new ScrapedData();
        data.setUrl(url);
        data.setTimestamp(System.currentTimeMillis());
        try {
            Document doc = Jsoup.connect(url)
                    .userAgent(config.getUserAgent())
                    .timeout(config.getTimeout())
                    .headers(config.getDefaultHeaders())
                    .get();
            // Basic page information.
            data.setTitle(doc.title());
            data.setContent(doc.body().text());
            // Metadata from <meta> tags (empty string when the tag is absent).
            data.setMetaDescription(doc.select("meta[name=description]").attr("content"));
            data.setMetaKeywords(doc.select("meta[name=keywords]").attr("content"));
            // Resolved absolute links and image sources.
            data.setLinks(extractLinks(doc));
            data.setImages(extractImages(doc));
            // Derived statistics.
            data.setWordCount(countWords(data.getContent()));
            data.setLinkCount(data.getLinks().size());
            data.setImageCount(data.getImages().size());
            data.setSuccess(true);
            logger.debug("Successfully scraped: {}", url);
        } catch (IOException e) {
            data.setSuccess(false);
            data.setError(e.getMessage());
            logger.warn("Failed to scrape {}: {}", url, e.getMessage());
        }
        return data;
    }

    /** Collects all non-empty absolute link URLs from the document. */
    private List<String> extractLinks(Document doc) {
        List<String> links = new ArrayList<>();
        doc.select("a[href]").forEach(link -> {
            String href = link.attr("abs:href");
            if (!href.isEmpty()) {
                links.add(href);
            }
        });
        return links;
    }

    /** Collects all non-empty absolute image source URLs from the document. */
    private List<String> extractImages(Document doc) {
        List<String> images = new ArrayList<>();
        doc.select("img[src]").forEach(img -> {
            String src = img.attr("abs:src");
            if (!src.isEmpty()) {
                images.add(src);
            }
        });
        return images;
    }

    /** Counts whitespace-separated tokens; 0 for null or blank text. */
    private int countWords(String text) {
        if (text == null || text.trim().isEmpty()) {
            return 0;
        }
        return text.trim().split("\\s+").length;
    }

    /** Shuts down the pool, waiting up to 60s before forcing termination. */
    public void shutdown() {
        executor.shutdown();
        try {
            if (!executor.awaitTermination(60, TimeUnit.SECONDS)) {
                executor.shutdownNow();
            }
        } catch (InterruptedException e) {
            executor.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /** Number of URLs scraped successfully so far. */
    public int getSuccessCount() {
        return successCount.get();
    }

    /** Number of URLs that failed (network error, timeout, or task failure). */
    public int getFailureCount() {
        return failureCount.get();
    }
}
Data Models
Example 3: Scraped Data Model
package com.webscraper.model;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.ArrayList;
import java.util.List;
@JsonInclude(JsonInclude.Include.NON_NULL)
public class ScrapedData {
@JsonProperty("url")
private String url;
@JsonProperty("title")
private String title;
@JsonProperty("content")
private String content;
@JsonProperty("meta_description")
private String metaDescription;
@JsonProperty("meta_keywords")
private String metaKeywords;
@JsonProperty("links")
private List<String> links = new ArrayList<>();
@JsonProperty("images")
private List<String> images = new ArrayList<>();
@JsonProperty("word_count")
private int wordCount;
@JsonProperty("link_count")
private int linkCount;
@JsonProperty("image_count")
private int imageCount;
@JsonProperty("timestamp")
private long timestamp;
@JsonProperty("success")
private boolean success;
@JsonProperty("error")
private String error;
// Constructors
public ScrapedData() {}
public ScrapedData(String url) {
this.url = url;
this.timestamp = System.currentTimeMillis();
}
// Getters and Setters
public String getUrl() { return url; }
public void setUrl(String url) { this.url = url; }
public String getTitle() { return title; }
public void setTitle(String title) { this.title = title; }
public String getContent() { return content; }
public void setContent(String content) { this.content = content; }
public String getMetaDescription() { return metaDescription; }
public void setMetaDescription(String metaDescription) { this.metaDescription = metaDescription; }
public String getMetaKeywords() { return metaKeywords; }
public void setMetaKeywords(String metaKeywords) { this.metaKeywords = metaKeywords; }
public List<String> getLinks() { return links; }
public void setLinks(List<String> links) { this.links = links; }
public List<String> getImages() { return images; }
public void setImages(List<String> images) { this.images = images; }
public int getWordCount() { return wordCount; }
public void setWordCount(int wordCount) { this.wordCount = wordCount; }
public int getLinkCount() { return linkCount; }
public void setLinkCount(int linkCount) { this.linkCount = linkCount; }
public int getImageCount() { return imageCount; }
public void setImageCount(int imageCount) { this.imageCount = imageCount; }
public long getTimestamp() { return timestamp; }
public void setTimestamp(long timestamp) { this.timestamp = timestamp; }
public boolean isSuccess() { return success; }
public void setSuccess(boolean success) { this.success = success; }
public String getError() { return error; }
public void setError(String error) { this.error = error; }
@Override
public String toString() {
return String.format(
"ScrapedData{url='%s', title='%s', wordCount=%d, links=%d, images=%d, success=%s}",
url, title, wordCount, linkCount, imageCount, success
);
}
}
Example 4: Scraper Configuration
package com.webscraper.config;
import java.util.HashMap;
import java.util.Map;
/**
 * Tunable connection and politeness settings shared by all scrapers.
 * Deliberately mutable: the CLI adjusts these values at runtime.
 */
public class ScraperConfig {

    private String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36";
    private int timeout = 30000; // connect/read timeout, milliseconds
    private int maxConcurrentRequests = 5;
    private String referrer = "https://www.google.com";
    private boolean followRedirects = true;
    private boolean ignoreHttpErrors = true;
    private int maxRetries = 3;
    private long delayBetweenRequests = 1000; // politeness delay, milliseconds
    private Map<String, String> defaultHeaders = new HashMap<>();

    public ScraperConfig() {
        installBrowserLikeHeaders();
    }

    /** Seeds the header map with values a typical desktop browser sends. */
    private void installBrowserLikeHeaders() {
        defaultHeaders.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        defaultHeaders.put("Accept-Language", "en-US,en;q=0.5");
        defaultHeaders.put("Accept-Encoding", "gzip, deflate");
        defaultHeaders.put("Connection", "keep-alive");
        defaultHeaders.put("Upgrade-Insecure-Requests", "1");
    }

    // --- accessors ---

    public String getUserAgent() { return userAgent; }
    public void setUserAgent(String userAgent) { this.userAgent = userAgent; }

    public int getTimeout() { return timeout; }
    public void setTimeout(int timeout) { this.timeout = timeout; }

    public int getMaxConcurrentRequests() { return maxConcurrentRequests; }
    public void setMaxConcurrentRequests(int maxConcurrentRequests) { this.maxConcurrentRequests = maxConcurrentRequests; }

    public String getReferrer() { return referrer; }
    public void setReferrer(String referrer) { this.referrer = referrer; }

    public boolean isFollowRedirects() { return followRedirects; }
    public void setFollowRedirects(boolean followRedirects) { this.followRedirects = followRedirects; }

    public boolean isIgnoreHttpErrors() { return ignoreHttpErrors; }
    public void setIgnoreHttpErrors(boolean ignoreHttpErrors) { this.ignoreHttpErrors = ignoreHttpErrors; }

    public int getMaxRetries() { return maxRetries; }
    public void setMaxRetries(int maxRetries) { this.maxRetries = maxRetries; }

    public long getDelayBetweenRequests() { return delayBetweenRequests; }
    public void setDelayBetweenRequests(long delayBetweenRequests) { this.delayBetweenRequests = delayBetweenRequests; }

    public Map<String, String> getDefaultHeaders() { return defaultHeaders; }
    public void setDefaultHeaders(Map<String, String> defaultHeaders) { this.defaultHeaders = defaultHeaders; }

    /** Adds (or replaces) a single default request header. */
    public void addHeader(String name, String value) {
        this.defaultHeaders.put(name, value);
    }
}
Specialized Scrapers
Example 5: E-commerce Product Scraper
package com.webscraper.specialized;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Heuristic product scraper for e-commerce pages. Each extraction tries a
 * list of common CSS selectors in order and returns the first match, so it
 * degrades gracefully across differently-structured shops.
 */
public class EcommerceScraper {

    private static final Logger logger = LoggerFactory.getLogger(EcommerceScraper.class);

    // Compiled once: these regexes are reused on every page/element parsed.
    private static final Pattern PRICE_PATTERN = Pattern.compile("([0-9]+[.,]?[0-9]*)");
    private static final Pattern DIGITS_PATTERN = Pattern.compile("([0-9]+)");

    private final ScraperConfig config;

    public EcommerceScraper(ScraperConfig config) {
        this.config = config;
    }

    /**
     * Fetches a product detail page and extracts name, price, currency,
     * description, images, specifications, availability, rating, and review
     * count using selector heuristics.
     *
     * @throws IOException if the page cannot be fetched
     */
    public Product extractProduct(String url) throws IOException {
        Document doc = Jsoup.connect(url)
                .userAgent(config.getUserAgent())
                .timeout(config.getTimeout())
                .get();
        Product product = new Product();
        product.setUrl(url);
        product.setName(extractProductName(doc));
        product.setPrice(extractPrice(doc));
        product.setCurrency(extractCurrency(doc));
        product.setDescription(extractDescription(doc));
        product.setImages(extractProductImages(doc));
        product.setSpecifications(extractSpecifications(doc));
        product.setAvailable(extractAvailability(doc));
        product.setRating(extractRating(doc));
        product.setReviewCount(extractReviewCount(doc));
        product.setTimestamp(System.currentTimeMillis());
        logger.info("Extracted product: {}", product.getName());
        return product;
    }

    /**
     * Extracts partial product records (link, name, price, first image) from
     * the cards of a category/listing page.
     *
     * @param productSelector CSS selector matching one element per product card
     */
    public List<Product> extractProductsFromListing(String listingUrl, String productSelector)
            throws IOException {
        Document doc = Jsoup.connect(listingUrl)
                .userAgent(config.getUserAgent())
                .timeout(config.getTimeout())
                .get();
        List<Product> products = new ArrayList<>();
        Elements productElements = doc.select(productSelector);
        for (Element productElement : productElements) {
            Product product = new Product();
            // Product link.
            Element linkElement = productElement.selectFirst("a[href]");
            if (linkElement != null) {
                product.setUrl(linkElement.attr("abs:href"));
            }
            // Product name.
            Element nameElement = productElement.selectFirst(".product-name, .title, h3, h4");
            if (nameElement != null) {
                product.setName(nameElement.text());
            }
            // Price. BUGFIX: selectFirst may return null when the card has no
            // price element; previously this threw a NullPointerException.
            Element priceElement = productElement.selectFirst(".price, .cost, .amount");
            if (priceElement != null) {
                product.setPrice(parsePrice(priceElement.text()));
            }
            // First image.
            Element imageElement = productElement.selectFirst("img[src]");
            if (imageElement != null) {
                product.getImages().add(imageElement.attr("abs:src"));
            }
            products.add(product);
        }
        logger.info("Extracted {} products from listing", products.size());
        return products;
    }

    /** Tries common name selectors; falls back to the page title. Never null. */
    private String extractProductName(Document doc) {
        String[] selectors = {
            "h1.product-title", "h1.product-name", "h1.title",
            "meta[property='og:title']", "title"
        };
        for (String selector : selectors) {
            Element element = doc.selectFirst(selector);
            if (element != null) {
                // Meta tags carry the value in "content", not in text.
                if (selector.startsWith("meta")) {
                    return element.attr("content");
                }
                return element.text();
            }
        }
        return "";
    }

    /** Tries common price selectors; null when no price could be parsed. */
    private Double extractPrice(Document doc) {
        String[] selectors = {
            ".price", ".product-price", ".cost", ".amount",
            "[itemprop=price]", ".current-price"
        };
        for (String selector : selectors) {
            Element element = doc.selectFirst(selector);
            if (element != null) {
                return parsePrice(element.text());
            }
        }
        return null;
    }

    /**
     * Pulls the first numeric value out of free-form price text, treating a
     * comma as a decimal separator. Returns null when nothing parses.
     */
    private Double parsePrice(String priceText) {
        if (priceText == null || priceText.trim().isEmpty()) {
            return null;
        }
        Matcher matcher = PRICE_PATTERN.matcher(priceText);
        if (matcher.find()) {
            String priceStr = matcher.group(1).replace(",", ".");
            try {
                return Double.parseDouble(priceStr);
            } catch (NumberFormatException e) {
                logger.warn("Failed to parse price: {}", priceText);
            }
        }
        return null;
    }

    /**
     * Guesses the currency from symbols in the price text or a currency meta
     * tag; defaults to "USD" when nothing is found.
     */
    private String extractCurrency(Document doc) {
        String[] priceSelectors = {".price", ".product-price", "[itemprop=price]"};
        for (String selector : priceSelectors) {
            Element element = doc.selectFirst(selector);
            if (element != null) {
                String priceText = element.text();
                if (priceText.contains("$")) return "USD";
                if (priceText.contains("€")) return "EUR";
                if (priceText.contains("£")) return "GBP";
            }
        }
        Element currencyMeta = doc.selectFirst("meta[property='price:currency']");
        if (currencyMeta != null) {
            return currencyMeta.attr("content");
        }
        return "USD"; // Default
    }

    /** Tries common description selectors; "" when none match. */
    private String extractDescription(Document doc) {
        String[] selectors = {
            ".product-description", ".description", "[itemprop=description]",
            ".product-details", "#description"
        };
        for (String selector : selectors) {
            Element element = doc.selectFirst(selector);
            if (element != null) {
                return element.text();
            }
        }
        return "";
    }

    /** Collects distinct absolute image URLs from common gallery selectors. */
    private List<String> extractProductImages(Document doc) {
        List<String> images = new ArrayList<>();
        String[] selectors = {
            ".product-image img", ".gallery img", "[itemprop=image]",
            ".main-image img", ".product-photo img"
        };
        for (String selector : selectors) {
            Elements imgElements = doc.select(selector);
            for (Element img : imgElements) {
                String src = img.attr("abs:src");
                // Skip empty sources and duplicates across selectors.
                if (!src.isEmpty() && !images.contains(src)) {
                    images.add(src);
                }
            }
        }
        return images;
    }

    /**
     * Parses spec key/value pairs from both table-based ("key | value" rows)
     * and list-based ("key: value" items) layouts.
     */
    private Map<String, String> extractSpecifications(Document doc) {
        Map<String, String> specs = new HashMap<>();
        // Table-based specifications.
        Elements specRows = doc.select("table.specifications tr, table.features tr");
        for (Element row : specRows) {
            Elements cells = row.select("td, th");
            if (cells.size() >= 2) {
                String key = cells.get(0).text().replace(":", "").trim();
                String value = cells.get(1).text().trim();
                specs.put(key, value);
            }
        }
        // List-based specifications.
        Elements specItems = doc.select(".specifications li, .features li");
        for (Element item : specItems) {
            String text = item.text();
            if (text.contains(":")) {
                String[] parts = text.split(":", 2);
                if (parts.length == 2) {
                    specs.put(parts[0].trim(), parts[1].trim());
                }
            }
        }
        return specs;
    }

    /**
     * Infers availability from stock-status text. Assumes available when the
     * page exposes no recognizable stock indicator.
     */
    private boolean extractAvailability(Document doc) {
        String[] inStockSelectors = {
            ".in-stock", ".available", "[itemprop=availability]",
            ".stock", ".availability"
        };
        for (String selector : inStockSelectors) {
            Element element = doc.selectFirst(selector);
            if (element != null) {
                String text = element.text().toLowerCase();
                return !text.contains("out of stock") &&
                       !text.contains("unavailable") &&
                       !text.contains("sold out");
            }
        }
        return true; // Assume available if not specified
    }

    /**
     * Tries to read a numeric rating from rating elements, falling back to
     * digits embedded in the element's class names (e.g. star-rating widgets).
     * Null when no rating is found.
     */
    private Double extractRating(Document doc) {
        String[] ratingSelectors = {
            "[itemprop=ratingValue]", ".rating", ".product-rating",
            ".star-rating", ".review-rating"
        };
        for (String selector : ratingSelectors) {
            Element element = doc.selectFirst(selector);
            if (element != null) {
                String ratingText = element.text();
                try {
                    return Double.parseDouble(ratingText);
                } catch (NumberFormatException e) {
                    // Fall back to digits in the class attribute.
                    String classAttr = element.className();
                    if (classAttr.matches(".*[0-9]+.*")) {
                        Matcher matcher = DIGITS_PATTERN.matcher(classAttr);
                        if (matcher.find()) {
                            return Double.parseDouble(matcher.group(1));
                        }
                    }
                }
            }
        }
        return null;
    }

    /** Parses the review count, ignoring non-digit characters; null if absent. */
    private Integer extractReviewCount(Document doc) {
        String[] reviewSelectors = {
            "[itemprop=reviewCount]", ".review-count", ".rating-count"
        };
        for (String selector : reviewSelectors) {
            Element element = doc.selectFirst(selector);
            if (element != null) {
                String countText = element.text();
                try {
                    return Integer.parseInt(countText.replaceAll("[^0-9]", ""));
                } catch (NumberFormatException e) {
                    logger.warn("Failed to parse review count: {}", countText);
                }
            }
        }
        return null;
    }
}
Example 6: Product Data Model
package com.webscraper.model;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Data-transfer object describing a single e-commerce product. Serialized to
 * JSON via Jackson using snake_case property names. Price, rating, and review
 * count are boxed types so that "unknown" can be represented as null.
 */
public class Product {

    @JsonProperty("name") private String name;
    @JsonProperty("url") private String url;
    @JsonProperty("price") private Double price;
    @JsonProperty("currency") private String currency;
    @JsonProperty("description") private String description;
    @JsonProperty("images") private List<String> images = new ArrayList<>();
    @JsonProperty("specifications") private Map<String, String> specifications = new HashMap<>();
    @JsonProperty("available") private boolean available;
    @JsonProperty("rating") private Double rating;
    @JsonProperty("review_count") private Integer reviewCount;
    @JsonProperty("timestamp") private long timestamp;

    // --- accessors ---

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }

    public Double getPrice() { return price; }
    public void setPrice(Double price) { this.price = price; }

    public String getCurrency() { return currency; }
    public void setCurrency(String currency) { this.currency = currency; }

    public String getDescription() { return description; }
    public void setDescription(String description) { this.description = description; }

    public List<String> getImages() { return images; }
    public void setImages(List<String> images) { this.images = images; }

    public Map<String, String> getSpecifications() { return specifications; }
    public void setSpecifications(Map<String, String> specifications) { this.specifications = specifications; }

    public boolean isAvailable() { return available; }
    public void setAvailable(boolean available) { this.available = available; }

    public Double getRating() { return rating; }
    public void setRating(Double rating) { this.rating = rating; }

    public Integer getReviewCount() { return reviewCount; }
    public void setReviewCount(Integer reviewCount) { this.reviewCount = reviewCount; }

    public long getTimestamp() { return timestamp; }
    public void setTimestamp(long timestamp) { this.timestamp = timestamp; }

    @Override
    public String toString() {
        return "Product{name='" + name + "', price=" + price + currency
                + ", available=" + available + ", rating=" + rating + "}";
    }
}
Data Exporters
Example 7: Multi-Format Data Exporter
package com.webscraper.export;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.opencsv.CSVWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
/**
 * Writes scraped results to JSON, CSV, or plain text. All writers create
 * missing parent directories and encode output as UTF-8.
 */
public class DataExporter {

    private static final Logger logger = LoggerFactory.getLogger(DataExporter.class);

    private final ObjectMapper objectMapper;

    public DataExporter() {
        this.objectMapper = new ObjectMapper();
        // Pretty-print so exported files are human-readable.
        this.objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    }

    /**
     * Serializes the items as pretty-printed JSON to {@code filePath}.
     *
     * @throws IOException if the file cannot be written
     */
    public void exportToJson(List<?> data, String filePath) throws IOException {
        ensureDirectoryExists(filePath);
        objectMapper.writeValue(Paths.get(filePath).toFile(), data);
        logger.info("Exported {} items to JSON: {}", data.size(), filePath);
    }

    /** Writes scrape summaries (one row per page) to a CSV file. */
    public void exportToCsv(List<ScrapedData> data, String filePath) throws IOException {
        ensureDirectoryExists(filePath);
        // BUGFIX: Files.newBufferedWriter encodes UTF-8 on every platform; the
        // previous FileWriter used the platform-default charset.
        try (CSVWriter writer = new CSVWriter(Files.newBufferedWriter(Paths.get(filePath)))) {
            String[] header = {
                "URL", "Title", "Word Count", "Link Count", "Image Count",
                "Timestamp", "Success", "Error"
            };
            writer.writeNext(header);
            for (ScrapedData item : data) {
                String[] row = {
                    item.getUrl(),
                    item.getTitle(),
                    String.valueOf(item.getWordCount()),
                    String.valueOf(item.getLinkCount()),
                    String.valueOf(item.getImageCount()),
                    String.valueOf(item.getTimestamp()),
                    String.valueOf(item.isSuccess()),
                    item.getError() != null ? item.getError() : ""
                };
                writer.writeNext(row);
            }
        }
        logger.info("Exported {} items to CSV: {}", data.size(), filePath);
    }

    /** Writes product records (one row per product) to a CSV file. */
    public void exportProductsToCsv(List<Product> products, String filePath) throws IOException {
        ensureDirectoryExists(filePath);
        // UTF-8 writer for the same reason as exportToCsv.
        try (CSVWriter writer = new CSVWriter(Files.newBufferedWriter(Paths.get(filePath)))) {
            String[] header = {
                "Name", "URL", "Price", "Currency", "Available",
                "Rating", "Review Count", "Image Count"
            };
            writer.writeNext(header);
            for (Product product : products) {
                String[] row = {
                    product.getName(),
                    product.getUrl(),
                    product.getPrice() != null ? String.valueOf(product.getPrice()) : "",
                    product.getCurrency(),
                    String.valueOf(product.isAvailable()),
                    product.getRating() != null ? String.valueOf(product.getRating()) : "",
                    product.getReviewCount() != null ? String.valueOf(product.getReviewCount()) : "",
                    String.valueOf(product.getImages().size())
                };
                writer.writeNext(row);
            }
        }
        logger.info("Exported {} products to CSV: {}", products.size(), filePath);
    }

    /** Writes a human-readable plain-text report of the scrape results. */
    public void exportToText(List<ScrapedData> data, String filePath) throws IOException {
        ensureDirectoryExists(filePath);
        StringBuilder content = new StringBuilder();
        content.append("Web Scraping Results\n");
        content.append("====================\n\n");
        for (ScrapedData item : data) {
            content.append("URL: ").append(item.getUrl()).append("\n");
            content.append("Title: ").append(item.getTitle()).append("\n");
            content.append("Word Count: ").append(item.getWordCount()).append("\n");
            content.append("Links: ").append(item.getLinkCount()).append("\n");
            content.append("Images: ").append(item.getImageCount()).append("\n");
            content.append("Success: ").append(item.isSuccess()).append("\n");
            if (item.getError() != null) {
                content.append("Error: ").append(item.getError()).append("\n");
            }
            content.append("\n").append("-".repeat(80)).append("\n\n");
        }
        // BUGFIX: encode explicitly as UTF-8 instead of the platform default.
        Files.write(Paths.get(filePath),
                content.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
        logger.info("Exported {} items to text: {}", data.size(), filePath);
    }

    /** Creates the parent directory of {@code filePath} if it does not exist. */
    private void ensureDirectoryExists(String filePath) throws IOException {
        Path path = Paths.get(filePath);
        Path parentDir = path.getParent();
        if (parentDir != null && !Files.exists(parentDir)) {
            Files.createDirectories(parentDir);
        }
    }

    /** Builds a timestamped filename, e.g. "results_1700000000000.json". */
    public String generateFilename(String prefix, String extension) {
        String timestamp = String.valueOf(System.currentTimeMillis());
        return String.format("%s_%s.%s", prefix, timestamp, extension);
    }
}
Utility Classes
Example 8: URL Utilities and Validators
package com.webscraper.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Static helpers for URL validation, normalization, and domain handling.
 */
public class UrlUtils {

    private static final Logger logger = LoggerFactory.getLogger(UrlUtils.class);

    // Accepts http/https/ftp URLs with a dotted host and optional path/query.
    private static final Pattern URL_PATTERN = Pattern.compile(
        "^(https?|ftp)://[\\w\\-]+(\\.[\\w\\-]+)+([\\w\\-.,@?^=%&:/~+#]*[\\w\\-@?^=%&/~+#])?$"
    );

    /**
     * Returns true when {@code url} is non-blank, parses as a java.net.URL,
     * and matches the URL pattern above.
     */
    public static boolean isValidUrl(String url) {
        if (url == null || url.trim().isEmpty()) {
            return false;
        }
        try {
            new URL(url);
            return URL_PATTERN.matcher(url).matches();
        } catch (MalformedURLException e) {
            return false;
        }
    }

    /** Trims the URL and prepends "https://" when no scheme is present. */
    public static String normalizeUrl(String url) {
        if (url == null) return null;
        url = url.trim();
        if (!url.startsWith("http://") && !url.startsWith("https://")) {
            url = "https://" + url;
        }
        return url;
    }

    /** Returns the host of {@code url}, or null when the URL is malformed. */
    public static String getDomain(String url) {
        try {
            URL parsedUrl = new URL(url);
            return parsedUrl.getHost();
        } catch (MalformedURLException e) {
            logger.warn("Failed to parse domain from URL: {}", url);
            return null;
        }
    }

    /**
     * Keeps only URLs whose host equals {@code domain}.
     * BUGFIX: getDomain may return null for malformed URLs; previously the
     * result was dereferenced directly and threw a NullPointerException.
     */
    public static List<String> filterUrlsByDomain(List<String> urls, String domain) {
        List<String> filtered = new ArrayList<>();
        for (String url : urls) {
            if (domain != null && domain.equals(getDomain(url))) {
                filtered.add(url);
            }
        }
        return filtered;
    }

    /** Removes duplicate URLs while preserving first-seen order. */
    public static List<String> removeDuplicates(List<String> urls) {
        // Fully qualified: java.util.LinkedHashSet was never imported here.
        return new ArrayList<>(new java.util.LinkedHashSet<>(urls));
    }

    /** True when both URLs parse and share the same host. */
    public static boolean isSameDomain(String url1, String url2) {
        String domain1 = getDomain(url1);
        String domain2 = getDomain(url2);
        return domain1 != null && domain1.equals(domain2);
    }
}
Main Application Class
Example 9: Command Line Interface
package com.webscraper;
import com.webscraper.config.ScraperConfig;
import com.webscraper.core.ConcurrentScraper;
import com.webscraper.export.DataExporter;
import com.webscraper.model.ScrapedData;
import com.webscraper.util.UrlUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
public class WebScraperCLI {
private static final Logger logger = LoggerFactory.getLogger(WebScraperCLI.class);
private final ScraperConfig config;
private final ConcurrentScraper scraper;
private final DataExporter exporter;
public WebScraperCLI() {
this.config = new ScraperConfig();
this.scraper = new ConcurrentScraper(config);
this.exporter = new DataExporter();
}
/** CLI entry point: builds the interactive scraper and hands control to it. */
public static void main(String[] args) {
    new WebScraperCLI().run();
}
/**
 * Main interactive loop: prints the menu once, then dispatches on the user's
 * choice until option 5 (exit) shuts down the scraper and returns.
 */
public void run() {
    Scanner scanner = new Scanner(System.in);
    String[] menu = {
        "=== Web Scraper with JSoup ===",
        "1. Scrape single URL",
        "2. Scrape multiple URLs from file",
        "3. Scrape with sitemap",
        "4. Configure settings",
        "5. Exit"
    };
    for (String line : menu) {
        System.out.println(line);
    }
    while (true) {
        System.out.print("\nChoose option (1-5): ");
        String choice = scanner.nextLine();
        if ("1".equals(choice)) {
            scrapeSingleUrl(scanner);
        } else if ("2".equals(choice)) {
            scrapeFromFile(scanner);
        } else if ("3".equals(choice)) {
            scrapeWithSitemap(scanner);
        } else if ("4".equals(choice)) {
            configureSettings(scanner);
        } else if ("5".equals(choice)) {
            System.out.println("Goodbye!");
            scraper.shutdown();
            return;
        } else {
            System.out.println("Invalid option. Please try again.");
        }
    }
}
private void scrapeSingleUrl(Scanner scanner) {
System.out.print("Enter URL to scrape: ");
String url = scanner.nextLine();
if (!UrlUtils.isValidUrl(url)) {
System.out.println("Invalid URL format.");
return;
}
try {
List<String> urls = List.of(url);
List<ScrapedData> results = scraper.scrapeUrls(urls);
if (!results.isEmpty()) {
exportResults(results, scanner);
}
} catch (Exception e) {
logger.error("Error during scraping", e);
System.out.println("Error: " + e.getMessage());
}
}
private void scrapeFromFile(Scanner scanner) {
System.out.print("Enter path to URL file: ");
String filePath = scanner.nextLine();
try {
List<String> urls = Files.readAllLines(Paths.get(filePath));
urls = urls.stream()
.filter(UrlUtils::isValidUrl)
.collect(Collectors.toList());
if (urls.isEmpty()) {
System.out.println("No valid URLs found in file.");
return;
}
System.out.printf("Found %d valid URLs. Starting scrape...\n", urls.size());
List<ScrapedData> results = scraper.scrapeUrls(urls);
exportResults(results, scanner);
} catch (IOException e) {
System.out.println("Error reading file: " + e.getMessage());
}
}
private void scrapeWithSitemap(Scanner scanner) {
System.out.print("Enter sitemap URL: ");
String sitemapUrl = scanner.nextLine();
try {
SitemapParser sitemapParser = new SitemapParser(config);
List<String> urls = sitemapParser.parseSitemap(sitemapUrl);
System.out.printf("Found %d URLs in sitemap. Starting scrape...\n", urls.size());
List<ScrapedData> results = scraper.scrapeUrls(urls);
exportResults(results, scanner);
} catch (Exception e) {
System.out.println("Error parsing sitemap: " + e.getMessage());
}
}
private void configureSettings(Scanner scanner) {
System.out.println("\n=== Configuration ===");
System.out.println("1. Set user agent");
System.out.println("2. Set timeout (ms)");
System.out.println("3. Set concurrent requests");
System.out.println("4. Set request delay (ms)");
System.out.println("5. Back to main menu");
System.out.print("Choose option: ");
String choice = scanner.nextLine();
switch (choice) {
case "1":
System.out.print("Enter user agent: ");
config.setUserAgent(scanner.nextLine());
break;
case "2":
System.out.print("Enter timeout (ms): ");
config.setTimeout(Integer.parseInt(scanner.nextLine()));
break;
case "3":
System.out.print("Enter concurrent requests: ");
config.setMaxConcurrentRequests(Integer.parseInt(scanner.nextLine()));
break;
case "4":
System.out.print("Enter request delay (ms): ");
config.setDelayBetweenRequests(Long.parseLong(scanner.nextLine()));
break;
case "5":
return;
default:
System.out.println("Invalid option.");
}
System.out.println("Configuration updated.");
}
private void exportResults(List<ScrapedData> results, Scanner scanner) {
System.out.println("\nScraping completed!");
System.out.printf("Success: %d, Failures: %d\n",
scraper.getSuccessCount(), scraper.getFailureCount());
System.out.println("\nExport options:");
System.out.println("1. JSON");
System.out.println("2. CSV");
System.out.println("3. Text");
System.out.println("4. Don't export");
System.out.print("Choose format: ");
String format = scanner.nextLine();
if (format.equals("4")) {
return;
}
System.out.print("Enter output file path: ");
String outputPath = scanner.nextLine();
try {
switch (format) {
case "1":
exporter.exportToJson(results, outputPath);
break;
case "2":
exporter.exportToCsv(results, outputPath);
break;
case "3":
exporter.exportToText(results, outputPath);
break;
default:
System.out.println("Invalid format.");
return;
}
System.out.println("Results exported successfully to: " + outputPath);
} catch (IOException e) {
System.out.println("Error exporting results: " + e.getMessage());
}
}
}
Advanced Features
Example 10: Sitemap Parser
package com.webscraper.core;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Downloads and parses XML sitemaps, following nested sitemap indexes, and
 * discovers sitemap locations for a domain by probing common paths and by
 * reading "Sitemap:" directives from robots.txt.
 */
public class SitemapParser {

    private static final Logger logger = LoggerFactory.getLogger(SitemapParser.class);

    private final ScraperConfig config;

    public SitemapParser(ScraperConfig config) {
        this.config = config;
    }

    /**
     * Fetches a sitemap URL and returns every page URL it lists. Sitemap
     * index files are followed recursively.
     *
     * <p>NOTE(review): cyclic sitemap index references would recurse
     * forever; acceptable for well-formed sitemaps but worth a visited-set
     * guard if hostile input is expected.
     *
     * @throws IOException if the sitemap cannot be fetched
     */
    public List<String> parseSitemap(String sitemapUrl) throws IOException {
        List<String> urls = new ArrayList<>();
        Document doc = Jsoup.connect(sitemapUrl)
                .userAgent(config.getUserAgent())
                .timeout(config.getTimeout())
                .ignoreContentType(true) // sitemaps are served as XML, not HTML
                .get();
        // A sitemap index lists further sitemaps; a urlset lists pages.
        if (!doc.select("sitemapindex").isEmpty()) {
            urls.addAll(parseSitemapIndex(doc));
        } else {
            urls.addAll(parseUrlset(doc));
        }
        logger.info("Parsed {} URLs from sitemap: {}", urls.size(), sitemapUrl);
        return urls;
    }

    /** Recursively resolves every nested sitemap referenced by an index. */
    private List<String> parseSitemapIndex(Document doc) {
        List<String> sitemapUrls = new ArrayList<>();
        Elements sitemapElements = doc.select("sitemap > loc");
        for (Element sitemap : sitemapElements) {
            String sitemapUrl = sitemap.text();
            try {
                sitemapUrls.addAll(parseSitemap(sitemapUrl));
            } catch (IOException e) {
                // Best effort: skip unreachable nested sitemaps, keep the rest.
                logger.warn("Failed to parse nested sitemap: {}", sitemapUrl);
            }
        }
        return sitemapUrls;
    }

    /** Extracts the page URLs from a {@code <urlset>} sitemap document. */
    private List<String> parseUrlset(Document doc) {
        List<String> urls = new ArrayList<>();
        Elements urlElements = doc.select("url > loc");
        for (Element urlElement : urlElements) {
            urls.add(urlElement.text());
        }
        return urls;
    }

    /**
     * Probes common sitemap locations under {@code domainUrl} and scans
     * robots.txt for "Sitemap:" directives.
     *
     * @param domainUrl base URL of the site, without a trailing slash
     * @return every sitemap URL that responded or was declared, without duplicates
     */
    public List<String> discoverSitemaps(String domainUrl) throws IOException {
        List<String> sitemapUrls = new ArrayList<>();
        // robots.txt is deliberately NOT in this list: it is not a sitemap
        // itself and is handled separately below.
        String[] commonSitemapPaths = {
                "/sitemap.xml",
                "/sitemap_index.xml",
                "/sitemap.php",
                "/sitemap.txt"
        };
        for (String path : commonSitemapPaths) {
            String sitemapUrl = domainUrl + path;
            try {
                Jsoup.connect(sitemapUrl)
                        .userAgent(config.getUserAgent())
                        .timeout(5000)
                        .ignoreContentType(true) // XML / plain text, not HTML
                        .execute();
                sitemapUrls.add(sitemapUrl);
                logger.info("Found sitemap: {}", sitemapUrl);
            } catch (IOException ignored) {
                // No sitemap at this path; keep probing the others.
            }
        }
        // robots.txt may declare additional sitemaps via "Sitemap:" lines.
        try {
            // Read the raw response body: robots.txt is text/plain, and
            // running it through the HTML parser would both throw an
            // unsupported-MIME-type error and collapse the line breaks.
            String robotsContent = Jsoup.connect(domainUrl + "/robots.txt")
                    .userAgent(config.getUserAgent())
                    .timeout(5000)
                    .ignoreContentType(true)
                    .execute()
                    .body();
            for (String line : robotsContent.split("\\r?\\n")) {
                String trimmed = line.trim();
                if (trimmed.toLowerCase().startsWith("sitemap:")) {
                    String sitemapUrl = trimmed.substring("sitemap:".length()).trim();
                    if (!sitemapUrl.isEmpty() && !sitemapUrls.contains(sitemapUrl)) {
                        sitemapUrls.add(sitemapUrl);
                        logger.info("Found sitemap in robots.txt: {}", sitemapUrl);
                    }
                }
            }
        } catch (IOException ignored) {
            // robots.txt missing or unreachable; return what was found so far.
        }
        return sitemapUrls;
    }
}
Features Summary
This Web Scraper with JSoup provides:
- Basic Scraping - Extract text, links, images, and tables
- Concurrent Processing - Scrape multiple URLs simultaneously
- E-commerce Focus - Specialized product data extraction
- Multiple Export Formats - JSON, CSV, and text
- Sitemap Support - Automatic discovery and parsing of sitemaps
- Configurable Settings - Customizable timeouts, user agents, and headers
- Error Handling - Robust error handling and retry mechanisms
- Data Validation - URL validation and data cleaning
The scraper is extensible and can be enhanced with features like:
- JavaScript rendering support
- CAPTCHA solving
- Proxy rotation
- Rate limiting
- Database storage
- Web interface
- API endpoints
- Scheduled scraping
- Advanced data analysis