Natural Language Processing: Apache OpenNLP Integration in Java

Apache OpenNLP is a machine learning-based toolkit for natural language processing tasks. This comprehensive guide covers integration, model training, and practical implementations for text processing in Java applications.


OpenNLP Architecture Overview

OpenNLP provides a suite of NLP tools with pre-trained models:

OpenNLP Processing Pipeline:
┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐
│  Raw Text       │ -> │  Sentence        │ -> │  Tokenization   │
│                 │    │  Detection       │    │                 │
└─────────────────┘    └──────────────────┘    └─────────────────┘
         │                      │                       │
         v                      v                       v
┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐
│  Part-of-Speech │    │  Named Entity    │    │  Parsing &      │
│  Tagging        │    │  Recognition     │    │  Chunking       │
└─────────────────┘    └──────────────────┘    └─────────────────┘
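
Each stage consumes the previous stage's output: sentence detection feeds the tokenizer, and the resulting tokens feed tagging, NER, chunking, and parsing. As a minimal sketch of the first two stages (model paths are placeholders; imports come from opennlp.tools.sentdetect and opennlp.tools.tokenize):

// Assumes en-sent.bin and en-token.bin are already on disk
try (InputStream sentIn = new FileInputStream("models/en-sent.bin");
     InputStream tokIn = new FileInputStream("models/en-token.bin")) {
    SentenceDetectorME detector = new SentenceDetectorME(new SentenceModel(sentIn));
    TokenizerME tokenizer = new TokenizerME(new TokenizerModel(tokIn));
    for (String sentence : detector.sentDetect("OpenNLP is fast. It is also trainable.")) {
        System.out.println(Arrays.toString(tokenizer.tokenize(sentence)));
    }
}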

Project Setup and Dependencies

1. Maven Configuration

<!-- pom.xml -->
<properties>
<opennlp.version>2.3.0</opennlp.version>
</properties>
<dependencies>
<!-- OpenNLP Core -->
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>${opennlp.version}</version>
</dependency>
<!-- OpenNLP UIMA integration (optional; only needed for UIMA pipelines) -->
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-uima</artifactId>
<version>${opennlp.version}</version>
</dependency>
<!-- Additional utilities -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.15.2</version>
</dependency>
<!-- Logging -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>2.0.7</version>
</dependency>
</dependencies>

2. Model Download Utility

package com.opennlp.utils;
import java.io.*;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
public class ModelManager {
private static final String MODEL_CACHE_DIR = "models/opennlp";
static {
// Create model directory if it doesn't exist
try {
Files.createDirectories(Paths.get(MODEL_CACHE_DIR));
} catch (IOException e) {
throw new RuntimeException("Failed to create model directory", e);
}
}
/**
* Download a pre-trained model into the local cache if it is not already there.
* The URL below points at the legacy 1.5-series models mirror; check
* https://opennlp.apache.org/models.html for current hosting before relying on it.
*/
public static Path downloadModel(String modelFileName) throws IOException {
Path modelPath = Paths.get(MODEL_CACHE_DIR, modelFileName);
if (Files.exists(modelPath)) {
return modelPath;
}
String modelUrl = "https://opennlp.sourceforge.net/models-1.5/" + modelFileName;
System.out.println("Downloading model: " + modelUrl);
try (InputStream in = new URL(modelUrl).openStream()) {
Files.copy(in, modelPath, StandardCopyOption.REPLACE_EXISTING);
}
return modelPath;
}
/**
* File names of the common legacy pre-trained English models. The newer
* UD-based models (e.g. opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin) are published
* separately on the Apache download site and use a different naming scheme.
*/
public static class Models {
public static final String SENTENCE_DETECTOR = "en-sent.bin";
public static final String TOKENIZER = "en-token.bin";
public static final String POS_TAGGER = "en-pos-maxent.bin";
public static final String NAME_FINDER = "en-ner-person.bin";
public static final String CHUNKER = "en-chunker.bin";
public static final String PARSER = "en-parser-chunking.bin";
}
}
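
A quick usage sketch (assuming the mirror URL above is still live):

// Downloads on first call, then serves the cached copy
Path sentenceModel = ModelManager.downloadModel(ModelManager.Models.SENTENCE_DETECTOR);
System.out.println("Model cached at: " + sentenceModel.toAbsolutePath());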

Core NLP Processing Pipeline

1. Comprehensive Text Processing Service

package com.opennlp.core;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import com.opennlp.utils.ModelManager;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
public class NLPProcessor implements AutoCloseable {
private SentenceDetectorME sentenceDetector;
private TokenizerME tokenizer;
private POSTaggerME posTagger;
private NameFinderME nameFinder;
private ChunkerME chunker;
private LemmatizerME lemmatizer;
private Parser parser; // loaded for the parsing stage; not exercised by analyzeText below
private boolean initialized = false;
public void initialize() throws IOException {
if (initialized) return;
// Load all models
loadSentenceModel();
loadTokenizerModel();
loadPOSTaggerModel();
loadNameFinderModel();
loadChunkerModel();
loadLemmatizerModel();
loadParserModel();
initialized = true;
System.out.println("NLP Processor initialized successfully");
}
private void loadSentenceModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.SENTENCE_DETECTOR);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
SentenceModel model = new SentenceModel(modelIn);
sentenceDetector = new SentenceDetectorME(model);
}
}
private void loadTokenizerModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.TOKENIZER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
TokenizerModel model = new TokenizerModel(modelIn);
tokenizer = new TokenizerME(model);
}
}
private void loadPOSTaggerModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.POS_TAGGER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
POSModel model = new POSModel(modelIn);
posTagger = new POSTaggerME(model);
}
}
private void loadNameFinderModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.NAME_FINDER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
nameFinder = new NameFinderME(model);
}
}
private void loadChunkerModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.CHUNKER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
ChunkerModel model = new ChunkerModel(modelIn);
chunker = new ChunkerME(model);
}
}
private void loadLemmatizerModel() throws IOException {
// No statistical lemmatizer model ships with the legacy 1.5 release;
// point this at a lemmatizer model you trained or obtained yourself.
Path modelPath = Paths.get("models/opennlp/en-lemmatizer.bin");
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
LemmatizerModel model = new LemmatizerModel(modelIn);
lemmatizer = new LemmatizerME(model);
}
}
private void loadParserModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.PARSER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
ParserModel model = new ParserModel(modelIn);
parser = ParserFactory.create(model);
}
}
/**
* Complete text analysis pipeline
*/
public TextAnalysisResult analyzeText(String text) {
if (!initialized) {
throw new IllegalStateException("NLP Processor not initialized");
}
TextAnalysisResult result = new TextAnalysisResult();
result.setOriginalText(text);
// Sentence detection
String[] sentences = detectSentences(text);
result.setSentences(sentences);
// Process each sentence
List<SentenceAnalysis> sentenceAnalyses = new ArrayList<>();
for (String sentence : sentences) {
SentenceAnalysis sentenceAnalysis = analyzeSentence(sentence);
sentenceAnalyses.add(sentenceAnalysis);
}
result.setSentenceAnalyses(sentenceAnalyses);
// Overall statistics
calculateStatistics(result);
return result;
}
/**
* Analyze individual sentence
*/
public SentenceAnalysis analyzeSentence(String sentence) {
SentenceAnalysis analysis = new SentenceAnalysis();
analysis.setOriginalSentence(sentence);
// Tokenization
String[] tokens = tokenize(sentence);
analysis.setTokens(tokens);
// Part-of-speech tagging
String[] posTags = tagPOS(tokens);
analysis.setPosTags(posTags);
// Named Entity Recognition
List<NamedEntity> namedEntities = findNamedEntities(tokens);
analysis.setNamedEntities(namedEntities);
// Chunking
String[] chunks = chunk(tokens, posTags);
analysis.setChunks(chunks);
// Lemmatization
String[] lemmas = lemmatize(tokens, posTags);
analysis.setLemmas(lemmas);
return analysis;
}
public String[] detectSentences(String text) {
return sentenceDetector.sentDetect(text);
}
public String[] tokenize(String text) {
return tokenizer.tokenize(text);
}
public String[] tagPOS(String[] tokens) {
return posTagger.tag(tokens);
}
public List<NamedEntity> findNamedEntities(String[] tokens) {
List<NamedEntity> entities = new ArrayList<>();
opennlp.tools.util.Span[] spans = nameFinder.find(tokens);
for (opennlp.tools.util.Span span : spans) {
String entity = String.join(" ", 
Arrays.copyOfRange(tokens, span.getStart(), span.getEnd()));
entities.add(new NamedEntity(entity, span.getType(), span.getStart(), span.getEnd()));
}
// Reset adaptive data so names from one input don't bias the next
nameFinder.clearAdaptiveData();
return entities;
}
public String[] chunk(String[] tokens, String[] posTags) {
return chunker.chunk(tokens, posTags);
}
public String[] lemmatize(String[] tokens, String[] posTags) {
return lemmatizer.lemmatize(tokens, posTags);
}
private void calculateStatistics(TextAnalysisResult result) {
int totalTokens = result.getSentenceAnalyses().stream()
.mapToInt(sa -> sa.getTokens().length)
.sum();
result.setTotalTokens(totalTokens);
int totalEntities = result.getSentenceAnalyses().stream()
.mapToInt(sa -> sa.getNamedEntities().size())
.sum();
result.setTotalNamedEntities(totalEntities);
// Calculate POS tag distribution
Map<String, Long> posDistribution = result.getSentenceAnalyses().stream()
.flatMap(sa -> Arrays.stream(sa.getPosTags()))
.collect(Collectors.groupingBy(tag -> tag, Collectors.counting()));
result.setPosDistribution(posDistribution);
}
@Override
public void close() {
// Clean up resources
sentenceDetector = null;
tokenizer = null;
posTagger = null;
nameFinder = null;
chunker = null;
lemmatizer = null;
parser = null;
initialized = false;
}
// Data transfer objects
public static class TextAnalysisResult {
private String originalText;
private String[] sentences;
private List<SentenceAnalysis> sentenceAnalyses;
private int totalTokens;
private int totalNamedEntities;
private Map<String, Long> posDistribution;
// Getters and setters
public String getOriginalText() { return originalText; }
public void setOriginalText(String originalText) { this.originalText = originalText; }
public String[] getSentences() { return sentences; }
public void setSentences(String[] sentences) { this.sentences = sentences; }
public List<SentenceAnalysis> getSentenceAnalyses() { return sentenceAnalyses; }
public void setSentenceAnalyses(List<SentenceAnalysis> sentenceAnalyses) { this.sentenceAnalyses = sentenceAnalyses; }
public int getTotalTokens() { return totalTokens; }
public void setTotalTokens(int totalTokens) { this.totalTokens = totalTokens; }
public int getTotalNamedEntities() { return totalNamedEntities; }
public void setTotalNamedEntities(int totalNamedEntities) { this.totalNamedEntities = totalNamedEntities; }
public Map<String, Long> getPosDistribution() { return posDistribution; }
public void setPosDistribution(Map<String, Long> posDistribution) { this.posDistribution = posDistribution; }
}
public static class SentenceAnalysis {
private String originalSentence;
private String[] tokens;
private String[] posTags;
private String[] lemmas;
private String[] chunks;
private List<NamedEntity> namedEntities;
// Getters and setters
public String getOriginalSentence() { return originalSentence; }
public void setOriginalSentence(String originalSentence) { this.originalSentence = originalSentence; }
public String[] getTokens() { return tokens; }
public void setTokens(String[] tokens) { this.tokens = tokens; }
public String[] getPosTags() { return posTags; }
public void setPosTags(String[] posTags) { this.posTags = posTags; }
public String[] getLemmas() { return lemmas; }
public void setLemmas(String[] lemmas) { this.lemmas = lemmas; }
public String[] getChunks() { return chunks; }
public void setChunks(String[] chunks) { this.chunks = chunks; }
public List<NamedEntity> getNamedEntities() { return namedEntities; }
public void setNamedEntities(List<NamedEntity> namedEntities) { this.namedEntities = namedEntities; }
}
public static class NamedEntity {
private final String text;
private final String type;
private final int start;
private final int end;
public NamedEntity(String text, String type, int start, int end) {
this.text = text;
this.type = type;
this.start = start;
this.end = end;
}
// Getters
public String getText() { return text; }
public String getType() { return type; }
public int getStart() { return start; }
public int getEnd() { return end; }
}
}
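
A minimal driver for the pipeline (the sample sentence and printing are illustrative):

try (NLPProcessor processor = new NLPProcessor()) {
    processor.initialize();
    NLPProcessor.TextAnalysisResult result =
            processor.analyzeText("John Smith works at Acme Corp in New York. He joined in 2019.");
    System.out.println("Sentences: " + result.getSentences().length);
    System.out.println("Tokens: " + result.getTotalTokens());
    result.getSentenceAnalyses().forEach(sa ->
            sa.getNamedEntities().forEach(ne ->
                    System.out.println(ne.getType() + ": " + ne.getText())));
}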

Custom Model Training

1. Named Entity Recognition Training
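
OpenNLP's name finder expects training data as one sentence per line, with each entity wrapped in span markers, for example:

<START:person> Pierre Vinken <END> , 61 years old , will join the board as a nonexecutive director Nov. 29 .

The trainer below consumes this format, trains a maxent model, and reports precision, recall, and F1 on held-out data.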

package com.opennlp.training;
import opennlp.tools.namefind.*;
import opennlp.tools.util.*;
import opennlp.tools.util.eval.FMeasure;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
public class NERTrainer {
/**
* Train a custom Named Entity Recognition model
*/
public void trainNERModel(Path trainingDataPath, Path modelOutputPath, 
String language) throws IOException {
// Prepare training parameters
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, "100");
params.put(TrainingParameters.CUTOFF_PARAM, "5");
params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
// Prepare training data
ObjectStream<NameSample> sampleStream = readTrainingData(trainingDataPath);
// Train the model
try {
TokenNameFinderModel model = NameFinderME.train(
language, 
"person", // entity type
sampleStream, 
params, 
new TokenNameFinderFactory()
);
// Save the model
try (OutputStream modelOut = Files.newOutputStream(modelOutputPath)) {
model.serialize(modelOut);
}
System.out.println("NER model trained and saved to: " + modelOutputPath);
} finally {
sampleStream.close();
}
}
private ObjectStream<NameSample> readTrainingData(Path trainingDataPath) throws IOException {
// OpenNLP 1.6+ takes an InputStreamFactory so the stream can be reset between passes
InputStreamFactory inputFactory = new MarkableFileInputStreamFactory(trainingDataPath.toFile());
return new NameSampleDataStream(
new PlainTextByLineStream(inputFactory, StandardCharsets.UTF_8)
);
}
/**
* Evaluate model performance
*/
public ModelEvaluation evaluateNERModel(Path modelPath, Path testDataPath) throws IOException {
try (InputStream modelIn = Files.newInputStream(modelPath)) {
TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
NameFinderME nameFinder = new NameFinderME(model);
ObjectStream<NameSample> testStream = readTrainingData(testDataPath);
FMeasure fmeasure = new FMeasure();
NameSample testSample;
while ((testSample = testStream.read()) != null) {
String[] tokens = testSample.getSentence();
opennlp.tools.util.Span[] predictedNames = nameFinder.find(tokens);
opennlp.tools.util.Span[] expectedNames = testSample.getNames();
fmeasure.updateScores(expectedNames, predictedNames);
}
testStream.close();
return new ModelEvaluation(
fmeasure.getPrecisionScore(),
fmeasure.getRecallScore(),
fmeasure.getFMeasure()
);
}
}
/**
* Generate training data in OpenNLP format
*/
public void generateTrainingData(List<NamedEntityExample> examples, Path outputPath) 
throws IOException {
try (BufferedWriter writer = Files.newBufferedWriter(outputPath, StandardCharsets.UTF_8)) {
for (NamedEntityExample example : examples) {
writer.write(example.toTrainingFormat());
writer.newLine();
}
}
}
public static class NamedEntityExample {
private final String sentence;
private final List<EntitySpan> entities;
public NamedEntityExample(String sentence, List<EntitySpan> entities) {
this.sentence = sentence;
this.entities = entities;
}
public String toTrainingFormat() {
String[] tokens = sentence.split("\\s+");
StringBuilder sb = new StringBuilder();
for (int i = 0; i < tokens.length; i++) {
// Open any entity that starts at this token (the marker precedes the token)
for (EntitySpan entity : entities) {
if (entity.start == i) {
sb.append("<START:").append(entity.type).append("> ");
}
}
sb.append(tokens[i]).append(" ");
// Close any entity whose exclusive end is the next index
for (EntitySpan entity : entities) {
if (entity.end == i + 1) {
sb.append("<END> ");
}
}
}
return sb.toString().trim();
}
}
public static class EntitySpan {
public final int start; // index of the first entity token
public final int end; // exclusive: one past the last entity token
public final String type;
public EntitySpan(int start, int end, String type) {
this.start = start;
this.end = end;
this.type = type;
}
}
public static class ModelEvaluation {
private final double precision;
private final double recall;
private final double f1Score;
public ModelEvaluation(double precision, double recall, double f1Score) {
this.precision = precision;
this.recall = recall;
this.f1Score = f1Score;
}
// Getters
public double getPrecision() { return precision; }
public double getRecall() { return recall; }
public double getF1Score() { return f1Score; }
@Override
public String toString() {
return String.format("Precision: %.3f, Recall: %.3f, F1: %.3f", 
precision, recall, f1Score);
}
}
}
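
Putting it together (the file paths are illustrative):

NERTrainer trainer = new NERTrainer();
Path trainingData = Path.of("data/ner-train.txt");
Path modelOut = Path.of("models/custom/en-ner-person-custom.bin");
trainer.trainNERModel(trainingData, modelOut, "en");
System.out.println(trainer.evaluateNERModel(modelOut, Path.of("data/ner-test.txt")));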

2. Sentiment Analysis Training

package com.opennlp.training;
import opennlp.tools.doccat.*;
import opennlp.tools.util.*;
import opennlp.tools.util.eval.FMeasure;
import opennlp.tools.util.model.ModelUtil;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
public class SentimentTrainer {
/**
* Train a sentiment analysis model
*/
public void trainSentimentModel(List<SentimentExample> trainingExamples, 
Path modelOutputPath) throws IOException {
// Prepare training data
ObjectStream<DocumentSample> sampleStream = 
new CollectionObjectStream<>(convertToDocumentSamples(trainingExamples));
// Training parameters
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, "100");
params.put(TrainingParameters.CUTOFF_PARAM, "2");
params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
// Feature generation: bag-of-words plus bigram features
DoccatFactory factory = new DoccatFactory(new FeatureGenerator[]{
new BagOfWordsFeatureGenerator(),
new NGramFeatureGenerator(2, 2) // bigrams; use (1, 2) to include unigrams as well
});
// Train model
DoccatModel model = DocumentCategorizerME.train(
"en", sampleStream, params, factory
);
// Save model
try (OutputStream modelOut = Files.newOutputStream(modelOutputPath)) {
model.serialize(modelOut);
}
System.out.println("Sentiment model trained and saved to: " + modelOutputPath);
}
private List<DocumentSample> convertToDocumentSamples(List<SentimentExample> examples) {
List<DocumentSample> samples = new ArrayList<>();
for (SentimentExample example : examples) {
// DocumentSample expects pre-tokenized text; simple whitespace split here
samples.add(new DocumentSample(example.getCategory(), example.getText().split("\\s+")));
}
return samples;
}
/**
* Cross-validate sentiment model
*/
public CrossValidationResult crossValidate(List<SentimentExample> examples, int folds) 
throws IOException {
CrossValidationPartitioner<DocumentSample> partitioner = 
new CrossValidationPartitioner<>(
convertToDocumentSamples(examples), folds
);
Map<String, FMeasure> fmeasures = new HashMap<>();
while (partitioner.hasNext()) {
CrossValidationPartitioner.TrainingSampleStream<DocumentSample> trainingStream = 
partitioner.next();
// Train on current fold
DoccatModel model = DocumentCategorizerME.train(
"en", trainingStream, ModelUtil.createDefaultParameters(), 
new DoccatFactory()
);
DocumentCategorizerME categorizer = new DocumentCategorizerME(model);
// Evaluate on the held-out partition for this fold
ObjectStream<DocumentSample> testStream = trainingStream.getTestSampleStream();
DocumentSample testSample;
while ((testSample = testStream.read()) != null) {
String[] tokens = testSample.getText(); // DocumentSample stores tokenized text
double[] outcomes = categorizer.categorize(tokens);
String predictedCategory = categorizer.getBestCategory(outcomes);
FMeasure fm = fmeasures.getOrDefault(testSample.getCategory(), new FMeasure());
// FMeasure compares reference/prediction arrays; wrap the single labels
fm.updateScores(new Object[]{testSample.getCategory()}, new Object[]{predictedCategory});
fmeasures.put(testSample.getCategory(), fm);
}
}
return new CrossValidationResult(fmeasures);
}
public static class SentimentExample {
private final String text;
private final String category; // "positive", "negative", "neutral"
public SentimentExample(String text, String category) {
this.text = text;
this.category = category;
}
// Getters
public String getText() { return text; }
public String getCategory() { return category; }
}
public static class CrossValidationResult {
private final Map<String, CategoryMetrics> categoryMetrics;
private final double macroF1;
public CrossValidationResult(Map<String, FMeasure> fmeasures) {
this.categoryMetrics = new HashMap<>();
double totalF1 = 0;
int count = 0;
for (Map.Entry<String, FMeasure> entry : fmeasures.entrySet()) {
FMeasure fm = entry.getValue();
categoryMetrics.put(entry.getKey(), new CategoryMetrics(
fm.getPrecisionScore(), fm.getRecallScore(), fm.getFMeasure()));
totalF1 += fm.getFMeasure();
count++;
}
// Macro-averaged F1 across categories (not raw accuracy)
this.macroF1 = count > 0 ? totalF1 / count : 0;
}
// Getters
public Map<String, CategoryMetrics> getCategoryMetrics() { return categoryMetrics; }
public double getMacroF1() { return macroF1; }
}
public static class CategoryMetrics {
private final double precision;
private final double recall;
private final double f1Score;
public CategoryMetrics(double precision, double recall, double f1Score) {
this.precision = precision;
this.recall = recall;
this.f1Score = f1Score;
}
// Getters
public double getPrecision() { return precision; }
public double getRecall() { return recall; }
public double getF1Score() { return f1Score; }
}
}
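
A short driver (toy examples; real training needs many labeled texts per category):

List<SentimentTrainer.SentimentExample> examples = List.of(
    new SentimentTrainer.SentimentExample("I love this product, it works great", "positive"),
    new SentimentTrainer.SentimentExample("Terrible quality, it broke in a day", "negative"),
    new SentimentTrainer.SentimentExample("The package arrived on Tuesday", "neutral"));
new SentimentTrainer().trainSentimentModel(examples, Path.of("models/custom/sentiment.bin"));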

Real-World Applications

1. Document Classification System

package com.opennlp.applications;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
public class DocumentClassifier {
private DocumentCategorizerME categorizer;
private TokenizerME tokenizer;
private Map<String, Double> confidenceThresholds;
public void initialize(Path modelPath, Path tokenizerModelPath) throws IOException {
try (InputStream modelIn = Files.newInputStream(modelPath)) {
DoccatModel model = new DoccatModel(modelIn);
categorizer = new DocumentCategorizerME(model);
}
try (InputStream tokenizerIn = Files.newInputStream(tokenizerModelPath)) {
TokenizerModel tokenizerModel = new TokenizerModel(tokenizerIn);
tokenizer = new TokenizerME(tokenizerModel);
}
// Set confidence thresholds for different categories
confidenceThresholds = new HashMap<>();
confidenceThresholds.put("urgent", 0.8);
confidenceThresholds.put("important", 0.7);
confidenceThresholds.put("normal", 0.5);
confidenceThresholds.put("low", 0.3);
}
public ClassificationResult classifyDocument(String documentText) {
String[] tokens = tokenizer.tokenize(documentText);
double[] outcomes = categorizer.categorize(tokens);
String bestCategory = categorizer.getBestCategory(outcomes);
double confidence = outcomes[categorizer.getIndex(bestCategory)];
// Apply confidence threshold
double threshold = confidenceThresholds.getOrDefault(bestCategory, 0.5);
if (confidence < threshold) {
bestCategory = "uncertain";
}
// Get all categories with scores
Map<String, Double> allScores = new HashMap<>();
for (int i = 0; i < categorizer.getNumberOfCategories(); i++) {
allScores.put(categorizer.getCategory(i), outcomes[i]);
}
return new ClassificationResult(bestCategory, confidence, allScores);
}
public List<DocumentClassification> batchClassify(List<Document> documents) {
return documents.parallelStream()
.map(doc -> {
ClassificationResult result = classifyDocument(doc.getContent());
return new DocumentClassification(doc, result);
})
.collect(Collectors.toList());
}
/**
* Extract key phrases from document based on classification
*/
public List<String> extractKeyPhrases(String documentText, String category) {
// Naive regex split; prefer a SentenceDetectorME for robust boundaries
String[] sentences = documentText.split("[.!?]+");
List<String> keyPhrases = new ArrayList<>();
for (String sentence : sentences) {
ClassificationResult result = classifyDocument(sentence);
if (result.getCategory().equals(category) && 
result.getConfidence() > 0.7) {
keyPhrases.add(sentence.trim());
}
}
return keyPhrases;
}
public static class ClassificationResult {
private final String category;
private final double confidence;
private final Map<String, Double> allScores;
public ClassificationResult(String category, double confidence, 
Map<String, Double> allScores) {
this.category = category;
this.confidence = confidence;
this.allScores = Collections.unmodifiableMap(allScores);
}
// Getters
public String getCategory() { return category; }
public double getConfidence() { return confidence; }
public Map<String, Double> getAllScores() { return allScores; }
}
public static class Document {
private final String id;
private final String content;
private final String title;
public Document(String id, String title, String content) {
this.id = id;
this.title = title;
this.content = content;
}
// Getters
public String getId() { return id; }
public String getContent() { return content; }
public String getTitle() { return title; }
}
public static class DocumentClassification {
private final Document document;
private final ClassificationResult classification;
public DocumentClassification(Document document, ClassificationResult classification) {
this.document = document;
this.classification = classification;
}
// Getters
public Document getDocument() { return document; }
public ClassificationResult getClassification() { return classification; }
}
}
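
Example usage (the model paths are placeholders for models you trained or downloaded earlier):

DocumentClassifier classifier = new DocumentClassifier();
classifier.initialize(Path.of("models/custom/doc-classifier.bin"),
        Path.of("models/opennlp/en-token.bin"));
DocumentClassifier.ClassificationResult result =
        classifier.classifyDocument("Server outage in us-east; multiple customers affected.");
System.out.println(result.getCategory() + " (confidence " + result.getConfidence() + ")");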

2. Intelligent Chatbot with NLP

package com.opennlp.applications;
import com.opennlp.core.NLPProcessor;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DoccatModel;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
public class IntelligentChatbot {
private NLPProcessor nlpProcessor;
private DocumentCategorizerME intentClassifier;
private Map<String, ResponseGenerator> responseGenerators;
private ChatHistory chatHistory;
public void initialize() throws Exception {
nlpProcessor = new NLPProcessor();
nlpProcessor.initialize();
// Load intent classification model
Path intentModelPath = Path.of("models/chatbot/intent-classifier.bin");
try (InputStream modelIn = Files.newInputStream(intentModelPath)) {
DoccatModel model = new DoccatModel(modelIn);
intentClassifier = new DocumentCategorizerME(model);
}
initializeResponseGenerators();
chatHistory = new ChatHistory();
}
private void initializeResponseGenerators() {
responseGenerators = new HashMap<>();
responseGenerators.put("greeting", new GreetingResponseGenerator());
responseGenerators.put("question", new QuestionResponseGenerator());
responseGenerators.put("complaint", new ComplaintResponseGenerator());
responseGenerators.put("feedback", new FeedbackResponseGenerator());
responseGenerators.put("goodbye", new GoodbyeResponseGenerator());
}
public ChatResponse processMessage(String userMessage, String userId) {
try {
// Analyze message with NLP
NLPProcessor.TextAnalysisResult analysis = nlpProcessor.analyzeText(userMessage);
// Classify intent
String intent = classifyIntent(userMessage);
// Extract entities
List<String> entities = extractEntities(analysis);
// Generate response
String response = generateResponse(intent, userMessage, entities, userId);
// Update chat history
chatHistory.addMessage(userId, userMessage, response, intent);
return new ChatResponse(response, intent, entities, analysis.getSentences());
} catch (Exception e) {
return new ChatResponse(
"I apologize, but I encountered an error processing your message.", 
"error", Collections.emptyList(), new String[0]
);
}
}
private String classifyIntent(String message) {
String[] tokens = nlpProcessor.tokenize(message);
double[] outcomes = intentClassifier.categorize(tokens);
return intentClassifier.getBestCategory(outcomes);
}
private List<String> extractEntities(NLPProcessor.TextAnalysisResult analysis) {
return analysis.getSentenceAnalyses().stream()
.flatMap(sa -> sa.getNamedEntities().stream())
.map(NLPProcessor.NamedEntity::getText)
.collect(Collectors.toList());
}
private String generateResponse(String intent, String userMessage, 
List<String> entities, String userId) {
ResponseGenerator generator = responseGenerators.get(intent);
if (generator != null) {
return generator.generateResponse(userMessage, entities, userId, chatHistory);
}
return "I understand you're saying: \"" + userMessage + 
"\". Could you please rephrase that?";
}
public ChatStatistics getChatStatistics(String userId) {
return chatHistory.getStatistics(userId);
}
// Response generator interfaces
public interface ResponseGenerator {
String generateResponse(String userMessage, List<String> entities, 
String userId, ChatHistory history);
}
public static class GreetingResponseGenerator implements ResponseGenerator {
private final Random random = new Random();
private final List<String> greetings = Arrays.asList(
"Hello! How can I help you today?",
"Hi there! What can I do for you?",
"Greetings! How may I assist you?",
"Welcome! How can I be of service?"
);
@Override
public String generateResponse(String userMessage, List<String> entities, 
String userId, ChatHistory history) {
return greetings.get(random.nextInt(greetings.size()));
}
}
public static class QuestionResponseGenerator implements ResponseGenerator {
@Override
public String generateResponse(String userMessage, List<String> entities, 
String userId, ChatHistory history) {
if (!entities.isEmpty()) {
return "Regarding " + entities.get(0) + ", I'd be happy to help. " +
"Could you provide more specific details about your question?";
}
return "That's an interesting question. Let me think about how best to assist you.";
}
}
// The remaining intents registered above need generators too; minimal
// placeholders so the class compiles (flesh these out for real use)
public static class ComplaintResponseGenerator implements ResponseGenerator {
@Override
public String generateResponse(String userMessage, List<String> entities,
String userId, ChatHistory history) {
return "I'm sorry to hear that. I've logged your complaint and will escalate it.";
}
}
public static class FeedbackResponseGenerator implements ResponseGenerator {
@Override
public String generateResponse(String userMessage, List<String> entities,
String userId, ChatHistory history) {
return "Thank you for the feedback! We'll use it to improve.";
}
}
public static class GoodbyeResponseGenerator implements ResponseGenerator {
@Override
public String generateResponse(String userMessage, List<String> entities,
String userId, ChatHistory history) {
return "Goodbye! Feel free to reach out anytime.";
}
}
// Chat history management
public static class ChatHistory {
private final Map<String, List<ChatMessage>> userHistories;
private final int maxHistorySize = 100;
public ChatHistory() {
this.userHistories = new HashMap<>();
}
public void addMessage(String userId, String userMessage, String botResponse, String intent) {
List<ChatMessage> history = userHistories.computeIfAbsent(userId, k -> new ArrayList<>());
history.add(new ChatMessage(userMessage, botResponse, intent, System.currentTimeMillis()));
// Limit history size
if (history.size() > maxHistorySize) {
history.remove(0);
}
}
public List<ChatMessage> getUserHistory(String userId) {
return Collections.unmodifiableList(
userHistories.getOrDefault(userId, Collections.emptyList())
);
}
public ChatStatistics getStatistics(String userId) {
List<ChatMessage> history = getUserHistory(userId);
Map<String, Long> intentCounts = history.stream()
.collect(Collectors.groupingBy(ChatMessage::getIntent, Collectors.counting()));
return new ChatStatistics(history.size(), intentCounts);
}
}
public static class ChatMessage {
private final String userMessage;
private final String botResponse;
private final String intent;
private final long timestamp;
public ChatMessage(String userMessage, String botResponse, String intent, long timestamp) {
this.userMessage = userMessage;
this.botResponse = botResponse;
this.intent = intent;
this.timestamp = timestamp;
}
// Getters
public String getUserMessage() { return userMessage; }
public String getBotResponse() { return botResponse; }
public String getIntent() { return intent; }
public long getTimestamp() { return timestamp; }
}
public static class ChatResponse {
private final String response;
private final String intent;
private final List<String> entities;
private final String[] sentences;
public ChatResponse(String response, String intent, List<String> entities, String[] sentences) {
this.response = response;
this.intent = intent;
this.entities = entities;
this.sentences = sentences;
}
// Getters
public String getResponse() { return response; }
public String getIntent() { return intent; }
public List<String> getEntities() { return entities; }
public String[] getSentences() { return sentences; }
}
public static class ChatStatistics {
private final int totalMessages;
private final Map<String, Long> intentDistribution;
public ChatStatistics(int totalMessages, Map<String, Long> intentDistribution) {
this.totalMessages = totalMessages;
this.intentDistribution = Collections.unmodifiableMap(intentDistribution);
}
// Getters
public int getTotalMessages() { return totalMessages; }
public Map<String, Long> getIntentDistribution() { return intentDistribution; }
}
}
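
Wiring it up (assumes an intent model already exists at models/chatbot/intent-classifier.bin, as loaded in initialize()):

IntelligentChatbot bot = new IntelligentChatbot();
bot.initialize();
IntelligentChatbot.ChatResponse reply =
        bot.processMessage("Hi! I have a question about my Acme order", "user-42");
System.out.println(reply.getIntent() + " -> " + reply.getResponse());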

Performance Optimization

1. Caching and Performance Monitoring

package com.opennlp.optimization;
import com.opennlp.core.NLPProcessor;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
public class OptimizedNLPProcessor {
private final NLPProcessor nlpProcessor;
private final Map<String, NLPProcessor.TextAnalysisResult> analysisCache;
private final ExecutorService processingPool;
private final PerformanceMonitor performanceMonitor;
private final int cacheSize;
public OptimizedNLPProcessor(int cacheSize, int threadPoolSize) throws Exception {
this.nlpProcessor = new NLPProcessor();
this.nlpProcessor.initialize();
this.cacheSize = cacheSize;
// Access-ordered LinkedHashMap gives simple LRU eviction
this.analysisCache = new LinkedHashMap<String, NLPProcessor.TextAnalysisResult>(16, 0.75f, true) {
@Override
protected boolean removeEldestEntry(Map.Entry<String, NLPProcessor.TextAnalysisResult> eldest) {
return size() > cacheSize;
}
};
this.processingPool = Executors.newFixedThreadPool(threadPoolSize);
this.performanceMonitor = new PerformanceMonitor();
}
public CompletableFuture<NLPProcessor.TextAnalysisResult> analyzeTextAsync(String text) {
String cacheKey = generateCacheKey(text);
// Check cache first
synchronized (analysisCache) {
NLPProcessor.TextAnalysisResult cached = analysisCache.get(cacheKey);
if (cached != null) {
performanceMonitor.recordCacheHit();
return CompletableFuture.completedFuture(cached);
}
}
performanceMonitor.recordCacheMiss();
return CompletableFuture.supplyAsync(() -> {
long startTime = System.nanoTime();
try {
NLPProcessor.TextAnalysisResult result = nlpProcessor.analyzeText(text);
// Cache the result
synchronized (analysisCache) {
analysisCache.put(cacheKey, result);
}
long processingTime = System.nanoTime() - startTime;
performanceMonitor.recordProcessingTime(processingTime);
return result;
} catch (Exception e) {
performanceMonitor.recordError();
throw new CompletionException("Text analysis failed", e);
}
}, processingPool);
}
public List<CompletableFuture<NLPProcessor.TextAnalysisResult>> analyzeBatchAsync(
List<String> texts) {
return texts.stream()
.map(this::analyzeTextAsync)
.collect(Collectors.toList());
}
private String generateCacheKey(String text) {
// hashCode is fast but can collide; use a digest such as SHA-256 if a
// wrong cache hit would be unacceptable
return Integer.toHexString(text.hashCode());
}
public PerformanceStats getPerformanceStats() {
return performanceMonitor.getStats();
}
public void clearCache() {
synchronized (analysisCache) {
analysisCache.clear();
}
}
public void shutdown() {
processingPool.shutdown();
try {
if (!processingPool.awaitTermination(5, TimeUnit.SECONDS)) {
processingPool.shutdownNow();
}
} catch (InterruptedException e) {
processingPool.shutdownNow();
Thread.currentThread().interrupt();
}
nlpProcessor.close();
}
public static class PerformanceMonitor {
private final AtomicLong cacheHits = new AtomicLong();
private final AtomicLong cacheMisses = new AtomicLong();
private final AtomicLong totalProcessingTime = new AtomicLong();
private final AtomicLong totalProcessedTexts = new AtomicLong();
private final AtomicLong errors = new AtomicLong();
public void recordCacheHit() {
cacheHits.incrementAndGet();
}
public void recordCacheMiss() {
cacheMisses.incrementAndGet();
}
public void recordProcessingTime(long nanos) {
totalProcessingTime.addAndGet(nanos);
totalProcessedTexts.incrementAndGet();
}
public void recordError() {
errors.incrementAndGet();
}
public PerformanceStats getStats() {
long hits = cacheHits.get();
long misses = cacheMisses.get();
long totalRequests = hits + misses;
double hitRatio = totalRequests > 0 ? (double) hits / totalRequests : 0.0;
double avgProcessingTime = totalProcessedTexts.get() > 0 ? 
(double) totalProcessingTime.get() / totalProcessedTexts.get() / 1_000_000 : 0.0;
return new PerformanceStats(
hits, misses, hitRatio, avgProcessingTime, errors.get()
);
}
}
public static class PerformanceStats {
private final long cacheHits;
private final long cacheMisses;
private final double cacheHitRatio;
private final double averageProcessingTimeMs;
private final long errorCount;
public PerformanceStats(long cacheHits, long cacheMisses, double cacheHitRatio,
double averageProcessingTimeMs, long errorCount) {
this.cacheHits = cacheHits;
this.cacheMisses = cacheMisses;
this.cacheHitRatio = cacheHitRatio;
this.averageProcessingTimeMs = averageProcessingTimeMs;
this.errorCount = errorCount;
}
// Getters
public long getCacheHits() { return cacheHits; }
public long getCacheMisses() { return cacheMisses; }
public double getCacheHitRatio() { return cacheHitRatio; }
public double getAverageProcessingTimeMs() { return averageProcessingTimeMs; }
public long getErrorCount() { return errorCount; }
@Override
public String toString() {
return String.format(
"Cache: %.1f%% hit ratio, Avg Time: %.2fms, Errors: %d",
cacheHitRatio * 100, averageProcessingTimeMs, errorCount
);
}
}
}
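
Example usage showing batch analysis and the collected statistics:

OptimizedNLPProcessor optimized = new OptimizedNLPProcessor(1_000, 4);
List<String> texts = List.of(
    "OpenNLP ships several trainable components.",
    "Caching repeated inputs avoids redundant analysis.");
List<CompletableFuture<NLPProcessor.TextAnalysisResult>> futures =
        optimized.analyzeBatchAsync(texts);
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
System.out.println(optimized.getPerformanceStats());
optimized.shutdown();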

Conclusion

Apache OpenNLP provides robust NLP capabilities for Java applications:

Key Features:

  • Sentence Detection - Split text into sentences
  • Tokenization - Break text into words/tokens
  • Part-of-Speech Tagging - Identify grammatical components
  • Named Entity Recognition - Extract people, organizations, locations
  • Text Classification - Categorize documents and sentences
  • Model Training - Custom model development

Best Practices:

  • Use caching for frequently processed texts
  • Implement async processing for better performance
  • Monitor model performance and accuracy
  • Combine rule-based and ML-based approaches
  • Handle multiple languages with appropriate models

Use Cases:

  • Document processing and classification
  • Chatbots and virtual assistants
  • Sentiment analysis for social media
  • Information extraction from text
  • Content recommendation systems

OpenNLP's combination of pre-trained models and custom training capabilities makes it suitable for both rapid prototyping and production-grade NLP applications.
