Apache Tika for Document Parsing in Java

Introduction

Apache Tika is a powerful content analysis and detection toolkit that can extract text and metadata from over a thousand different file types. It provides a unified API for parsing various document formats including PDF, Microsoft Office documents, images, and more.
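
For a first taste of that unified API, here is a minimal sketch using the Tika facade: one call to detect a file's type and one to extract its text. The file name report.pdf is only a placeholder.

import org.apache.tika.Tika;
import java.io.File;

public class TikaQuickStart {
    public static void main(String[] args) throws Exception {
        Tika tika = new Tika();
        File file = new File("report.pdf"); // placeholder path

        // Detect the MIME type from the file name and leading bytes
        String mimeType = tika.detect(file);
        System.out.println("Detected type: " + mimeType);

        // Extract plain text with a single call
        String text = tika.parseToString(file);
        System.out.println("Extracted " + text.length() + " characters");
    }
}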

Setup and Dependencies

Maven Dependencies

<dependencies>
    <!-- Apache Tika Core -->
    <dependency>
        <groupId>org.apache.tika</groupId>
        <artifactId>tika-core</artifactId>
        <version>2.9.1</version>
    </dependency>
    <!-- Apache Tika Parsers -->
    <dependency>
        <groupId>org.apache.tika</groupId>
        <artifactId>tika-parsers-standard-package</artifactId>
        <version>2.9.1</version>
    </dependency>
    <!-- For OCR functionality -->
    <dependency>
        <groupId>org.apache.tika</groupId>
        <artifactId>tika-parser-ocr-module</artifactId>
        <version>2.9.1</version>
    </dependency>
    <!-- For language detection -->
    <dependency>
        <groupId>org.apache.tika</groupId>
        <artifactId>tika-langdetect-optimaize</artifactId>
        <version>2.9.1</version>
    </dependency>
</dependencies>

Gradle Dependencies

dependencies {
    implementation 'org.apache.tika:tika-core:2.9.1'
    implementation 'org.apache.tika:tika-parsers-standard-package:2.9.1'
    implementation 'org.apache.tika:tika-parser-ocr-module:2.9.1'
    implementation 'org.apache.tika:tika-langdetect-optimaize:2.9.1'
}

Basic Document Parsing

Simple Text Extraction

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;

public class BasicTikaParser {

    public String extractTextFromFile(File file) throws IOException, TikaException {
        // Method 1: Using the Tika facade (simplest approach)
        Tika tika = new Tika();
        tika.setMaxStringLength(100 * 1024 * 1024); // Return at most ~100M characters
        return tika.parseToString(file);
    }

    public String extractTextWithMetadata(File file) throws IOException, TikaException, SAXException {
        // Method 2: Using the parser directly for more control;
        // the Metadata object is populated as a side effect of parsing
        Parser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler(-1); // -1 disables the write limit
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        try (InputStream stream = new FileInputStream(file)) {
            parser.parse(stream, handler, metadata, context);
            return handler.toString();
        }
    }

    public void parseFileWithDetailedControl(Path filePath) throws IOException, TikaException, SAXException {
        AutoDetectParser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler(10 * 1024 * 1024); // 10M character limit
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        try (InputStream stream = Files.newInputStream(filePath)) {
            // Set the resource name for better type detection
            // (Tika 2.x moved this constant to TikaCoreProperties)
            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filePath.getFileName().toString());
            parser.parse(stream, handler, metadata, context);
            // Process results
            String content = handler.toString();
            System.out.println("Extracted content: "
                    + content.substring(0, Math.min(200, content.length())) + "...");
            // Print all metadata
            printAllMetadata(metadata);
        }
    }

    private void printAllMetadata(Metadata metadata) {
        System.out.println("\n=== METADATA ===");
        for (String name : metadata.names()) {
            System.out.println(name + ": " + metadata.get(name));
        }
    }

    public static void main(String[] args) {
        BasicTikaParser parser = new BasicTikaParser();
        try {
            File sampleFile = new File("sample.pdf");
            if (sampleFile.exists()) {
                String content = parser.extractTextFromFile(sampleFile);
                System.out.println("Extracted text length: " + content.length());
                System.out.println("First 500 chars: "
                        + content.substring(0, Math.min(500, content.length())));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Advanced Parsing Techniques

Custom Parser with Error Handling

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.SafeContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;

public class AdvancedTikaParser {

    private final TikaConfig tikaConfig;
    private final Detector detector;
    private final AtomicLong processedFiles;

    public AdvancedTikaParser() throws Exception {
        this.tikaConfig = TikaConfig.getDefaultConfig();
        this.detector = tikaConfig.getDetector();
        this.processedFiles = new AtomicLong(0);
    }

    public static class ParseResult {
        private final String content;
        private final Metadata metadata;
        private final MediaType mediaType;
        private final ParseStatus status;

        public ParseResult(String content, Metadata metadata, MediaType mediaType, ParseStatus status) {
            this.content = content;
            this.metadata = metadata;
            this.mediaType = mediaType;
            this.status = status;
        }

        // Getters
        public String getContent() { return content; }
        public Metadata getMetadata() { return metadata; }
        public MediaType getMediaType() { return mediaType; }
        public ParseStatus getStatus() { return status; }
    }

    public enum ParseStatus {
        SUCCESS,
        ENCRYPTED,
        UNSUPPORTED_FORMAT,
        PARSE_ERROR,
        TOO_LARGE
    }

    public ParseResult parseDocument(Path filePath) {
        return parseDocument(filePath, 50 * 1024 * 1024); // 50M character default limit
    }

    public ParseResult parseDocument(Path filePath, long maxContentLength) {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        // Keep a reference to the BodyContentHandler so its content can be read back;
        // note that its write-limit constructor takes an int
        BodyContentHandler bodyHandler = new BodyContentHandler((int) maxContentLength);
        ContentHandler handler = new SafeContentHandler(bodyHandler);
        try (TikaInputStream tikaStream = TikaInputStream.get(filePath, metadata)) {
            // Detect media type
            MediaType mediaType = detector.detect(tikaStream, metadata);
            metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
            // Get the composite parser from the configuration
            Parser parser = tikaConfig.getParser();
            context.set(Parser.class, parser);
            // Parse the document
            parser.parse(tikaStream, handler, metadata, context);
            processedFiles.incrementAndGet();
            return new ParseResult(bodyHandler.toString(), metadata, mediaType, ParseStatus.SUCCESS);
        } catch (EncryptedDocumentException e) {
            return new ParseResult("", new Metadata(), null, ParseStatus.ENCRYPTED);
        } catch (UnsupportedFormatException e) {
            return new ParseResult("", new Metadata(), null, ParseStatus.UNSUPPORTED_FORMAT);
        } catch (SAXException e) {
            // BodyContentHandler signals an exceeded write limit via a SAXException;
            // return the truncated content that was extracted before the limit
            if (e.getMessage() != null && e.getMessage().contains("Your document contained more than")) {
                return new ParseResult(bodyHandler.toString(), metadata, null, ParseStatus.TOO_LARGE);
            }
            return new ParseResult("", new Metadata(), null, ParseStatus.PARSE_ERROR);
        } catch (Exception e) {
            return new ParseResult("", new Metadata(), null, ParseStatus.PARSE_ERROR);
        }
    }

    public void processDirectory(Path directory) throws IOException {
        try (Stream<Path> paths = Files.walk(directory)) {
            paths.filter(Files::isRegularFile)
                 .forEach(file -> {
                     ParseResult result = parseDocument(file);
                     System.out.println("File: " + file.getFileName());
                     System.out.println("Status: " + result.getStatus());
                     System.out.println("Media Type: "
                             + (result.getMediaType() != null ? result.getMediaType() : "Unknown"));
                     if (result.getStatus() == ParseStatus.SUCCESS) {
                         System.out.println("Content length: " + result.getContent().length());
                         // Tika 2.x exposes these through TikaCoreProperties
                         // (the old Metadata.TITLE/AUTHOR constants were removed)
                         System.out.println("Title: " + result.getMetadata().get(TikaCoreProperties.TITLE));
                         System.out.println("Author: " + result.getMetadata().get(TikaCoreProperties.CREATOR));
                     }
                     System.out.println("---");
                 });
        }
    }

    public long getProcessedFilesCount() {
        return processedFiles.get();
    }
}
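
A brief usage sketch for the parser above; the file and directory names are placeholders.

import java.nio.file.Paths;

public class AdvancedTikaParserDemo {
    public static void main(String[] args) throws Exception {
        AdvancedTikaParser parser = new AdvancedTikaParser();

        // Parse a single file with a 10M character limit (path is a placeholder)
        AdvancedTikaParser.ParseResult result =
                parser.parseDocument(Paths.get("contract.docx"), 10 * 1024 * 1024);

        switch (result.getStatus()) {
            case SUCCESS:
                System.out.println("Parsed " + result.getContent().length()
                        + " characters as " + result.getMediaType());
                break;
            case ENCRYPTED:
                System.out.println("Document is password protected");
                break;
            default:
                System.out.println("Parse failed: " + result.getStatus());
        }

        // Or walk a whole directory
        parser.processDirectory(Paths.get("documents"));
        System.out.println("Processed " + parser.getProcessedFilesCount() + " files");
    }
}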

Metadata Extraction

Comprehensive Metadata Handling

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.helpers.DefaultHandler;

import java.io.*;
import java.nio.file.Path;
import java.util.*;

public class MetadataExtractor {

    public static class DocumentMetadata {
        private final Map<String, List<String>> metadata;
        private final String mediaType;
        private final long fileSize;
        private final Date createdDate;
        private final Date modifiedDate;

        public DocumentMetadata(Map<String, List<String>> metadata, String mediaType,
                                long fileSize, Date createdDate, Date modifiedDate) {
            this.metadata = metadata;
            this.mediaType = mediaType;
            this.fileSize = fileSize;
            this.createdDate = createdDate;
            this.modifiedDate = modifiedDate;
        }

        // Getters
        public Map<String, List<String>> getMetadata() { return metadata; }
        public String getMediaType() { return mediaType; }
        public long getFileSize() { return fileSize; }
        public Date getCreatedDate() { return createdDate; }
        public Date getModifiedDate() { return modifiedDate; }
    }

    public DocumentMetadata extractMetadata(Path filePath) throws IOException {
        return extractMetadata(filePath.toFile());
    }

    public DocumentMetadata extractMetadata(File file) throws IOException {
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        // Set the file name for better detection
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, file.getName());
        try (InputStream stream = new FileInputStream(file)) {
            // We only care about metadata, so discard the body with a no-op handler
            // (a zero-limit BodyContentHandler would abort the parse instead)
            parser.parse(stream, new DefaultHandler(), metadata, context);
            return buildDocumentMetadata(metadata, file);
        } catch (Exception e) {
            throw new IOException("Failed to extract metadata from " + file.getName(), e);
        }
    }

    private DocumentMetadata buildDocumentMetadata(Metadata metadata, File file) {
        Map<String, List<String>> metadataMap = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
        // Extract all metadata fields
        for (String name : metadata.names()) {
            metadataMap.put(name, Arrays.asList(metadata.getValues(name)));
        }
        // Get basic file information
        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        long fileSize = file.length();
        // Let Tika parse the ISO-8601 date strings for us
        Date createdDate = metadata.getDate(TikaCoreProperties.CREATED);
        Date modifiedDate = metadata.getDate(TikaCoreProperties.MODIFIED);
        return new DocumentMetadata(metadataMap, mediaType, fileSize, createdDate, modifiedDate);
    }

    public void printFormattedMetadata(DocumentMetadata docMetadata) {
        System.out.println("=== DOCUMENT METADATA ===");
        System.out.println("Media Type: " + docMetadata.getMediaType());
        System.out.println("File Size: " + docMetadata.getFileSize() + " bytes");
        System.out.println("Created: " + docMetadata.getCreatedDate());
        System.out.println("Modified: " + docMetadata.getModifiedDate());
        System.out.println("\n=== ALL METADATA FIELDS ===");
        docMetadata.getMetadata().forEach((key, values) ->
                System.out.println(key + ": " + String.join("; ", values)));
    }

    public Map<String, String> getCommonMetadata(DocumentMetadata docMetadata) {
        // Tika 2.x removed the old string constants from Metadata;
        // the equivalents now live in TikaCoreProperties and Office
        Map<String, String> common = new LinkedHashMap<>();
        common.put("Title", getFirstValue(docMetadata, TikaCoreProperties.TITLE.getName()));
        common.put("Author", getFirstValue(docMetadata, TikaCoreProperties.CREATOR.getName()));
        common.put("Creator Tool", getFirstValue(docMetadata, TikaCoreProperties.CREATOR_TOOL.getName()));
        // Keywords are folded into dc:subject in Tika 2.x
        common.put("Subject", getFirstValue(docMetadata, TikaCoreProperties.SUBJECT.getName()));
        common.put("Description", getFirstValue(docMetadata, TikaCoreProperties.DESCRIPTION.getName()));
        common.put("Created", getFirstValue(docMetadata, TikaCoreProperties.CREATED.getName()));
        common.put("Modified", getFirstValue(docMetadata, TikaCoreProperties.MODIFIED.getName()));
        common.put("Last Author", getFirstValue(docMetadata, TikaCoreProperties.MODIFIER.getName()));
        common.put("Page Count", getFirstValue(docMetadata, Office.PAGE_COUNT.getName()));
        common.put("Word Count", getFirstValue(docMetadata, Office.WORD_COUNT.getName()));
        return common;
    }

    private String getFirstValue(DocumentMetadata docMetadata, String key) {
        List<String> values = docMetadata.getMetadata().get(key);
        return values != null && !values.isEmpty() ? values.get(0) : "N/A";
    }

    // Specific metadata extractors for different file types
    public Map<String, String> extractPdfMetadata(DocumentMetadata docMetadata) {
        Map<String, String> pdfMetadata = new LinkedHashMap<>();
        pdfMetadata.put("PDF Version", getFirstValue(docMetadata, "pdf:PDFVersion"));
        pdfMetadata.put("Producer", getFirstValue(docMetadata, "pdf:producer"));
        pdfMetadata.put("Encrypted", getFirstValue(docMetadata, "pdf:encrypted"));
        pdfMetadata.put("Tagged", getFirstValue(docMetadata, "pdf:tagged"));
        return pdfMetadata;
    }

    public Map<String, String> extractOfficeMetadata(DocumentMetadata docMetadata) {
        // Key names below follow Tika 2.x Office output
        Map<String, String> officeMetadata = new LinkedHashMap<>();
        officeMetadata.put("Application", getFirstValue(docMetadata, "extended-properties:Application"));
        officeMetadata.put("Company", getFirstValue(docMetadata, "extended-properties:Company"));
        officeMetadata.put("Manager", getFirstValue(docMetadata, "extended-properties:Manager"));
        officeMetadata.put("Template", getFirstValue(docMetadata, "extended-properties:Template"));
        officeMetadata.put("Revision", getFirstValue(docMetadata, "cp:revision"));
        return officeMetadata;
    }

    public Map<String, String> extractImageMetadata(DocumentMetadata docMetadata) {
        Map<String, String> imageMetadata = new LinkedHashMap<>();
        imageMetadata.put("Width", getFirstValue(docMetadata, "tiff:ImageWidth"));
        imageMetadata.put("Height", getFirstValue(docMetadata, "tiff:ImageLength"));
        imageMetadata.put("Bits Per Sample", getFirstValue(docMetadata, "tiff:BitsPerSample"));
        imageMetadata.put("Color Space", getFirstValue(docMetadata, "tiff:ColorSpace"));
        // EXIF data
        imageMetadata.put("Camera Make", getFirstValue(docMetadata, "tiff:Make"));
        imageMetadata.put("Camera Model", getFirstValue(docMetadata, "tiff:Model"));
        imageMetadata.put("Date Taken", getFirstValue(docMetadata, "exif:DateTimeOriginal"));
        imageMetadata.put("Exposure Time", getFirstValue(docMetadata, "exif:ExposureTime"));
        imageMetadata.put("F Number", getFirstValue(docMetadata, "exif:FNumber"));
        imageMetadata.put("ISO Speed", getFirstValue(docMetadata, "exif:ISOSpeedRatings"));
        return imageMetadata;
    }
}
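
A short usage example tying the extractor's pieces together; the file name is a placeholder.

import java.io.File;

public class MetadataExtractorDemo {
    public static void main(String[] args) throws Exception {
        MetadataExtractor extractor = new MetadataExtractor();

        // File name is a placeholder
        MetadataExtractor.DocumentMetadata docMetadata =
                extractor.extractMetadata(new File("report.pdf"));

        // Dump every field, then the curated common subset
        extractor.printFormattedMetadata(docMetadata);
        extractor.getCommonMetadata(docMetadata)
                .forEach((key, value) -> System.out.println(key + " = " + value));

        // Format-specific view for PDFs
        if ("application/pdf".equals(docMetadata.getMediaType())) {
            extractor.extractPdfMetadata(docMetadata)
                    .forEach((key, value) -> System.out.println(key + " = " + value));
        }
    }
}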

OCR Integration

Optical Character Recognition with Tika

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;

public class TikaOCRProcessor {

    private final Tika tika;
    private final TesseractOCRConfig ocrConfig;

    public TikaOCRProcessor() {
        this.tika = new Tika();
        this.ocrConfig = createOCRConfig();
    }

    private TesseractOCRConfig createOCRConfig() {
        // Note: setter names have shifted between Tika releases;
        // check the TesseractOCRConfig javadoc for your exact version
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setLanguage("eng+fra+deu+spa"); // Requires the matching Tesseract language packs
        config.setPageSegMode("1"); // Automatic page segmentation with OSD
        config.setTimeoutSeconds(120); // 2 minute timeout per file
        config.setMaxFileSizeToOcr(50 * 1024 * 1024); // 50MB limit
        config.setMinFileSizeToOcr(10); // 10 bytes minimum
        // Image preprocessing (parts of this rely on ImageMagick being installed)
        config.setEnableImagePreprocessing(true);
        config.setPreserveInterwordSpacing(true);
        config.setApplyRotation(true);
        return config;
    }

    public String extractTextWithOCR(File imageFile) throws IOException, TikaException {
        // Simple approach using the Tika facade
        tika.setMaxStringLength(100 * 1024 * 1024);
        return tika.parseToString(imageFile);
    }

    public OCRResult extractTextWithOCRAdvanced(File file) {
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        BodyContentHandler handler = new BodyContentHandler(-1);
        // Configure OCR
        context.set(TesseractOCRConfig.class, ocrConfig);
        // For PDFs, control how OCR is combined with native text extraction
        PDFParserConfig pdfConfig = new PDFParserConfig();
        pdfConfig.setExtractInlineImages(true);
        pdfConfig.setExtractUniqueInlineImagesOnly(false);
        // Other strategies: NO_OCR, AUTO, OCR_AND_TEXT_EXTRACTION
        pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
        context.set(PDFParserConfig.class, pdfConfig);
        try (InputStream stream = new FileInputStream(file)) {
            Parser parser = new AutoDetectParser();
            long startTime = System.nanoTime();
            parser.parse(stream, handler, metadata, context);
            long duration = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime);
            return new OCRResult(handler.toString(), metadata, duration, true, null);
        } catch (Exception e) {
            return new OCRResult("", new Metadata(), 0, false, e.getMessage());
        }
    }

    public static class OCRResult {
        private final String extractedText;
        private final Metadata metadata;
        private final long processingTimeMs;
        private final boolean success;
        private final String errorMessage;

        public OCRResult(String extractedText, Metadata metadata, long processingTimeMs,
                         boolean success, String errorMessage) {
            this.extractedText = extractedText;
            this.metadata = metadata;
            this.processingTimeMs = processingTimeMs;
            this.success = success;
            this.errorMessage = errorMessage;
        }

        // Getters
        public String getExtractedText() { return extractedText; }
        public Metadata getMetadata() { return metadata; }
        public long getProcessingTimeMs() { return processingTimeMs; }
        public boolean isSuccess() { return success; }
        public String getErrorMessage() { return errorMessage; }
    }

    public void processImageDirectory(Path imageDir) throws IOException {
        try (Stream<Path> paths = Files.walk(imageDir)) {
            paths.filter(Files::isRegularFile)
                 .filter(this::isImageFile)
                 .forEach(file -> {
                     try {
                         System.out.println("Processing: " + file.getFileName());
                         OCRResult result = extractTextWithOCRAdvanced(file.toFile());
                         if (result.isSuccess()) {
                             System.out.println("Success! Processing time: "
                                     + result.getProcessingTimeMs() + "ms");
                             System.out.println("Text length: " + result.getExtractedText().length());
                             // Save extracted text
                             saveExtractedText(file, result.getExtractedText());
                         } else {
                             System.out.println("Failed: " + result.getErrorMessage());
                         }
                     } catch (Exception e) {
                         System.out.println("Error processing " + file.getFileName() + ": " + e.getMessage());
                     }
                 });
        }
    }

    private boolean isImageFile(Path file) {
        try {
            String mimeType = tika.detect(file);
            return mimeType.startsWith("image/");
        } catch (IOException e) {
            return false;
        }
    }

    private void saveExtractedText(Path imageFile, String text) {
        Path textFile = imageFile.resolveSibling(
                imageFile.getFileName().toString() + ".txt");
        try (BufferedWriter writer = Files.newBufferedWriter(textFile)) {
            writer.write(text);
        } catch (IOException e) {
            System.err.println("Failed to save extracted text: " + e.getMessage());
        }
    }

    // Crude availability check: try to run the tesseract binary from the PATH
    // (constructing a TesseractOCRConfig does not verify the installation)
    public boolean isOCRAvailable() {
        try {
            Process process = new ProcessBuilder("tesseract", "--version")
                    .redirectErrorStream(true)
                    .start();
            boolean finished = process.waitFor(10, TimeUnit.SECONDS);
            return finished && process.exitValue() == 0;
        } catch (IOException e) {
            return false; // Binary not found
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }
}
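
A minimal usage sketch, assuming Tesseract is installed; scan.png is a placeholder for a scanned image. Without Tesseract, image parsing quietly yields metadata but no text, so the availability check comes first.

import java.io.File;

public class OCRDemo {
    public static void main(String[] args) {
        TikaOCRProcessor processor = new TikaOCRProcessor();

        // OCR silently yields empty text when Tesseract is missing, so check first
        if (!processor.isOCRAvailable()) {
            System.err.println("Tesseract not found on PATH; install it to enable OCR");
            return;
        }

        // File name is a placeholder for a scanned document
        TikaOCRProcessor.OCRResult result =
                processor.extractTextWithOCRAdvanced(new File("scan.png"));
        if (result.isSuccess()) {
            System.out.println("OCR took " + result.getProcessingTimeMs() + "ms");
            System.out.println(result.getExtractedText());
        } else {
            System.err.println("OCR failed: " + result.getErrorMessage());
        }
    }
}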

Language Detection

Multi-language Content Processing

import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class LanguageDetectionProcessor {

    private final LanguageDetector languageDetector;

    public LanguageDetectionProcessor() throws Exception {
        // Loads the first detector found on the classpath (Optimaize here)
        this.languageDetector = LanguageDetector.getDefaultLanguageDetector();
        this.languageDetector.loadModels();
    }

    public static class LanguageDetectionResult {
        private final String primaryLanguage;
        private final double confidence;
        private final List<LanguageProbability> probabilities;

        public LanguageDetectionResult(String primaryLanguage, double confidence,
                                       List<LanguageProbability> probabilities) {
            this.primaryLanguage = primaryLanguage;
            this.confidence = confidence;
            this.probabilities = probabilities;
        }

        // Getters
        public String getPrimaryLanguage() { return primaryLanguage; }
        public double getConfidence() { return confidence; }
        public List<LanguageProbability> getProbabilities() { return probabilities; }
    }

    public static class LanguageProbability {
        private final String language;
        private final double probability;

        public LanguageProbability(String language, double probability) {
            this.language = language;
            this.probability = probability;
        }

        // Getters
        public String getLanguage() { return language; }
        public double getProbability() { return probability; }
    }

    public LanguageDetectionResult detectLanguage(String text) {
        if (text == null || text.trim().isEmpty()) {
            return new LanguageDetectionResult("unknown", 0.0, Collections.emptyList());
        }
        // Statistical detection is unreliable on very short texts
        if (text.length() < 50) {
            return detectLanguageShortText(text);
        }
        languageDetector.reset();
        languageDetector.addText(text.toCharArray(), 0, text.length());
        // detectAll() returns candidate languages ordered by score, best first
        List<LanguageResult> candidates = languageDetector.detectAll();
        if (candidates.isEmpty()) {
            return new LanguageDetectionResult("unknown", 0.0, Collections.emptyList());
        }
        List<LanguageProbability> probabilities = candidates.stream()
                .map(r -> new LanguageProbability(r.getLanguage(), r.getRawScore()))
                .collect(Collectors.toList());
        LanguageResult best = candidates.get(0);
        return new LanguageDetectionResult(best.getLanguage(), best.getRawScore(), probabilities);
    }

    private LanguageDetectionResult detectLanguageShortText(String text) {
        // For very short texts, fall back to a crude stop-word heuristic;
        // this is illustrative only and covers just three languages
        Map<String, Integer> languageScores = new HashMap<>();
        if (text.matches(".*\\b(the|and|is|in|to|of|a|that|it|for)\\b.*")) {
            languageScores.put("en", 10);
        }
        if (text.matches(".*\\b(und|der|die|das|ist|in|den|von|zu|dem)\\b.*")) {
            languageScores.put("de", 10);
        }
        if (text.matches(".*\\b(et|le|la|les|de|un|une|est|dans|pour)\\b.*")) {
            languageScores.put("fr", 10);
        }
        // Find the language with the highest score
        String detectedLang = languageScores.entrySet().stream()
                .max(Map.Entry.comparingByValue())
                .map(Map.Entry::getKey)
                .orElse("unknown");
        List<LanguageProbability> probabilities = languageScores.entrySet().stream()
                .map(entry -> new LanguageProbability(entry.getKey(), entry.getValue() / 10.0))
                .collect(Collectors.toList());
        return new LanguageDetectionResult(detectedLang, 0.5, probabilities);
    }

    public Map<String, List<String>> segmentByLanguage(List<String> documents) {
        Map<String, List<String>> segmented = new HashMap<>();
        for (String doc : documents) {
            LanguageDetectionResult result = detectLanguage(doc);
            String lang = result.getPrimaryLanguage();
            segmented.computeIfAbsent(lang, k -> new ArrayList<>()).add(doc);
        }
        return segmented;
    }

    public void analyzeDocumentLanguages(Path documentDir) throws IOException {
        BasicTikaParser parser = new BasicTikaParser();
        try (Stream<Path> paths = Files.walk(documentDir)) {
            paths.filter(Files::isRegularFile)
                 .forEach(file -> {
                     try {
                         String content = parser.extractTextFromFile(file.toFile());
                         LanguageDetectionResult result = detectLanguage(content);
                         System.out.println("File: " + file.getFileName());
                         System.out.println("Detected language: " + result.getPrimaryLanguage());
                         System.out.println("Confidence: " + result.getConfidence());
                         System.out.println("Content preview: "
                                 + content.substring(0, Math.min(100, content.length())));
                         System.out.println("---");
                     } catch (Exception e) {
                         System.out.println("Error processing " + file.getFileName() + ": " + e.getMessage());
                     }
                 });
        }
    }
}
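
A quick usage sketch with inline sample sentences:

import java.util.List;
import java.util.Map;

public class LanguageDetectionDemo {
    public static void main(String[] args) throws Exception {
        LanguageDetectionProcessor detector = new LanguageDetectionProcessor();

        // Detect a single string
        LanguageDetectionProcessor.LanguageDetectionResult result = detector.detectLanguage(
                "Apache Tika detects the language of extracted text using statistical models.");
        System.out.println("Language: " + result.getPrimaryLanguage()
                + " (score " + result.getConfidence() + ")");

        // Group a small corpus by detected language
        Map<String, List<String>> byLanguage = detector.segmentByLanguage(List.of(
                "The quick brown fox jumps over the lazy dog.",
                "Der schnelle braune Fuchs springt über den faulen Hund."));
        byLanguage.forEach((lang, docs) ->
                System.out.println(lang + ": " + docs.size() + " document(s)"));
    }
}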

Real-World Use Cases

Document Processing Pipeline

import java.io.*;
import java.nio.file.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class DocumentProcessingPipeline {

    private final AdvancedTikaParser parser;
    private final MetadataExtractor metadataExtractor;
    private final TikaOCRProcessor ocrProcessor;
    private final LanguageDetectionProcessor languageDetector;
    private final ExecutorService executorService;
    private final Set<String> supportedFormats;

    public DocumentProcessingPipeline() throws Exception {
        this.parser = new AdvancedTikaParser();
        this.metadataExtractor = new MetadataExtractor();
        this.ocrProcessor = new TikaOCRProcessor();
        this.languageDetector = new LanguageDetectionProcessor();
        this.executorService = Executors.newFixedThreadPool(
                Runtime.getRuntime().availableProcessors());
        this.supportedFormats = Set.of(
                "application/pdf", "application/msword",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "application/vnd.ms-excel",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                "text/plain", "text/html", "application/rtf",
                "image/jpeg", "image/png", "image/tiff");
    }

    public static class ProcessingResult {
        private final Path filePath;
        private final String content;
        private final MetadataExtractor.DocumentMetadata metadata;
        private final String primaryLanguage;
        private final boolean usedOCR;
        private final long processingTimeMs;

        public ProcessingResult(Path filePath, String content,
                                MetadataExtractor.DocumentMetadata metadata,
                                String primaryLanguage, boolean usedOCR,
                                long processingTimeMs) {
            this.filePath = filePath;
            this.content = content;
            this.metadata = metadata;
            this.primaryLanguage = primaryLanguage;
            this.usedOCR = usedOCR;
            this.processingTimeMs = processingTimeMs;
        }

        // Getters
        public Path getFilePath() { return filePath; }
        public String getContent() { return content; }
        public MetadataExtractor.DocumentMetadata getMetadata() { return metadata; }
        public String getPrimaryLanguage() { return primaryLanguage; }
        public boolean isUsedOCR() { return usedOCR; }
        public long getProcessingTimeMs() { return processingTimeMs; }
    }

    public CompletableFuture<ProcessingResult> processDocumentAsync(Path filePath) {
        return CompletableFuture.supplyAsync(() -> {
            long startTime = System.currentTimeMillis();
            try {
                // Step 1: Extract metadata
                MetadataExtractor.DocumentMetadata metadata =
                        metadataExtractor.extractMetadata(filePath);
                // Check if the format is supported
                if (!isFormatSupported(metadata.getMediaType())) {
                    throw new UnsupportedOperationException(
                            "Unsupported format: " + metadata.getMediaType());
                }
                // Step 2: Extract content
                String content;
                boolean usedOCR = false;
                if (isImageFile(metadata.getMediaType())) {
                    // Use OCR for images
                    TikaOCRProcessor.OCRResult ocrResult =
                            ocrProcessor.extractTextWithOCRAdvanced(filePath.toFile());
                    if (!ocrResult.isSuccess()) {
                        throw new IOException("OCR failed: " + ocrResult.getErrorMessage());
                    }
                    content = ocrResult.getExtractedText();
                    usedOCR = true;
                } else {
                    // Use regular parsing for documents
                    AdvancedTikaParser.ParseResult parseResult = parser.parseDocument(filePath);
                    if (parseResult.getStatus() != AdvancedTikaParser.ParseStatus.SUCCESS) {
                        throw new IOException("Parsing failed with status: " + parseResult.getStatus());
                    }
                    content = parseResult.getContent();
                }
                // Step 3: Detect language
                String language = languageDetector.detectLanguage(content).getPrimaryLanguage();
                long processingTime = System.currentTimeMillis() - startTime;
                return new ProcessingResult(
                        filePath, content, metadata, language, usedOCR, processingTime);
            } catch (Exception e) {
                throw new CompletionException(e);
            }
        }, executorService);
    }

    public List<ProcessingResult> processDocumentBatch(List<Path> files) {
        List<CompletableFuture<ProcessingResult>> futures = files.stream()
                .map(this::processDocumentAsync)
                .collect(Collectors.toList());
        CompletableFuture<Void> allFutures = CompletableFuture.allOf(
                futures.toArray(new CompletableFuture[0]));
        return allFutures.thenApply(v ->
                futures.stream()
                        .map(CompletableFuture::join)
                        .collect(Collectors.toList())
        ).join();
    }

    public void processDirectoryWithProgress(Path directory) throws IOException {
        List<Path> files;
        try (Stream<Path> paths = Files.walk(directory)) {
            files = paths.filter(Files::isRegularFile).collect(Collectors.toList());
        }
        System.out.println("Found " + files.size() + " files to process");
        List<CompletableFuture<ProcessingResult>> futures = files.stream()
                .map(file -> processDocumentAsync(file)
                        .whenComplete((result, throwable) -> {
                            if (throwable != null) {
                                System.err.println("Failed to process " + file.getFileName()
                                        + ": " + throwable.getMessage());
                            } else {
                                System.out.println("Processed: " + file.getFileName()
                                        + " (" + result.getProcessingTimeMs() + "ms)");
                            }
                        }))
                .collect(Collectors.toList());
        // Wait for all processing to complete, tolerating individual failures
        // (a bare join() would throw on the first failed file and skip the summary)
        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0]))
                .exceptionally(t -> null)
                .join();
        // Generate summary
        generateProcessingSummary(futures);
    }

    private void generateProcessingSummary(List<CompletableFuture<ProcessingResult>> futures) {
        List<ProcessingResult> results = futures.stream()
                .filter(future -> future.isDone() && !future.isCompletedExceptionally())
                .map(CompletableFuture::join)
                .collect(Collectors.toList());
        System.out.println("\n=== PROCESSING SUMMARY ===");
        System.out.println("Total files processed: " + results.size());
        // Use a placeholder key for files whose media type could not be determined
        Map<String, Long> formatCount = results.stream()
                .collect(Collectors.groupingBy(
                        r -> Objects.toString(r.getMetadata().getMediaType(), "unknown"),
                        Collectors.counting()));
        System.out.println("\nFormats processed:");
        formatCount.forEach((format, count) ->
                System.out.println("  " + format + ": " + count));
        Map<String, Long> languageCount = results.stream()
                .collect(Collectors.groupingBy(
                        ProcessingResult::getPrimaryLanguage,
                        Collectors.counting()));
        System.out.println("\nLanguages detected:");
        languageCount.forEach((lang, count) ->
                System.out.println("  " + lang + ": " + count));
        long ocrCount = results.stream()
                .filter(ProcessingResult::isUsedOCR)
                .count();
        System.out.println("\nOCR processed files: " + ocrCount);
        double avgProcessingTime = results.stream()
                .mapToLong(ProcessingResult::getProcessingTimeMs)
                .average()
                .orElse(0);
        System.out.printf("Average processing time: %.1f ms%n", avgProcessingTime);
    }

    private boolean isFormatSupported(String mediaType) {
        return mediaType != null && supportedFormats.stream().anyMatch(mediaType::startsWith);
    }

    private boolean isImageFile(String mediaType) {
        return mediaType != null && mediaType.startsWith("image/");
    }

    public void shutdown() {
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                executorService.shutdownNow();
            }
        } catch (InterruptedException e) {
            executorService.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }
}
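
A minimal driver for the pipeline; the inbox directory is a placeholder, and shutdown() runs in a finally block so the worker threads are always released.

import java.nio.file.Paths;

public class PipelineDemo {
    public static void main(String[] args) throws Exception {
        DocumentProcessingPipeline pipeline = new DocumentProcessingPipeline();
        try {
            // Directory path is a placeholder
            pipeline.processDirectoryWithProgress(Paths.get("inbox"));
        } finally {
            // Always release the executor's worker threads
            pipeline.shutdown();
        }
    }
}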

Best Practices and Configuration

Tika Configuration Manager

import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.Parser;

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Set;
import java.util.stream.Collectors;

public class TikaConfigurationManager {

    private final TikaConfig tikaConfig;

    public TikaConfigurationManager() throws Exception {
        // Load the default configuration
        this.tikaConfig = TikaConfig.getDefaultConfig();
    }

    public TikaConfigurationManager(Path customConfigPath) throws Exception {
        // Load a custom tika-config.xml
        try (InputStream configStream = Files.newInputStream(customConfigPath)) {
            this.tikaConfig = new TikaConfig(configStream);
        }
    }

    public Parser getParser() {
        return tikaConfig.getParser();
    }

    public Set<String> getSupportedTypes() {
        // All media types known to the registry (not every type has a dedicated parser)
        return tikaConfig.getMediaTypeRegistry().getTypes().stream()
                .map(MediaType::toString)
                .collect(Collectors.toSet());
    }

    public void printSupportedTypes() {
        System.out.println("=== SUPPORTED MIME TYPES ===");
        getSupportedTypes().stream()
                .sorted()
                .forEach(System.out::println);
    }

    public boolean isTypeSupported(String mimeType) {
        return getSupportedTypes().contains(mimeType);
    }
}
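
A short usage sketch; tika-config.xml is a placeholder path for a custom configuration file.

import java.nio.file.Paths;

public class ConfigDemo {
    public static void main(String[] args) throws Exception {
        // Default configuration discovered from the classpath
        TikaConfigurationManager manager = new TikaConfigurationManager();
        System.out.println("PDF supported: " + manager.isTypeSupported("application/pdf"));

        // Or load a custom tika-config.xml (path is a placeholder)
        TikaConfigurationManager custom =
                new TikaConfigurationManager(Paths.get("tika-config.xml"));
        custom.printSupportedTypes();
    }
}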

Apache Tika provides a comprehensive solution for document parsing and content extraction in Java applications. The examples above demonstrate how to leverage its capabilities for text extraction, metadata processing, OCR, and language detection in real-world scenarios.
