Natural Language Processing: Apache OpenNLP Integration in Java

Apache OpenNLP is a machine learning-based toolkit for natural language processing tasks. This comprehensive guide covers integration, model training, and practical implementations for text processing in Java applications.


OpenNLP Architecture Overview

OpenNLP provides a suite of NLP tools with pre-trained models:

OpenNLP Processing Pipeline:
┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐
│  Raw Text       │ -> │  Sentence        │ -> │  Tokenization   │
│                 │    │  Detection       │    │                 │
└─────────────────┘    └──────────────────┘    └─────────────────┘
         │                      │                       │
         v                      v                       v
┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐
│  Part-of-Speech │    │  Named Entity    │    │  Parsing &      │
│  Tagging        │    │  Recognition     │    │  Chunking       │
└─────────────────┘    └──────────────────┘    └─────────────────┘
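
Each stage consumes the previous stage's output: sentence detection feeds the tokenizer, and the resulting tokens feed tagging, NER, chunking, and parsing. As a minimal sketch of the first two stages (model paths are placeholders; imports come from opennlp.tools.sentdetect and opennlp.tools.tokenize):

// Assumes en-sent.bin and en-token.bin are already on disk
try (InputStream sentIn = new FileInputStream("models/en-sent.bin");
     InputStream tokIn = new FileInputStream("models/en-token.bin")) {
    SentenceDetectorME detector = new SentenceDetectorME(new SentenceModel(sentIn));
    TokenizerME tokenizer = new TokenizerME(new TokenizerModel(tokIn));
    for (String sentence : detector.sentDetect("OpenNLP is fast. It is also trainable.")) {
        System.out.println(Arrays.toString(tokenizer.tokenize(sentence)));
    }
}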

Project Setup and Dependencies

1. Maven Configuration

<!-- pom.xml -->
<properties>
<opennlp.version>2.3.0</opennlp.version>
</properties>
<dependencies>
<!-- OpenNLP Core -->
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>${opennlp.version}</version>
</dependency>
<!-- OpenNLP UIMA integration (optional; only needed for UIMA pipelines) -->
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-uima</artifactId>
<version>${opennlp.version}</version>
</dependency>
<!-- Additional utilities -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.15.2</version>
</dependency>
<!-- Logging -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>2.0.7</version>
</dependency>
</dependencies>

2. Model Download Utility

package com.opennlp.utils;
import java.io.*;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
public class ModelManager {
private static final String MODEL_CACHE_DIR = "models/opennlp";
static {
// Create model directory if it doesn't exist
try {
Files.createDirectories(Paths.get(MODEL_CACHE_DIR));
} catch (IOException e) {
throw new RuntimeException("Failed to create model directory", e);
}
}
/**
* Download a pre-trained model into the local cache if it is not already there.
* The URL below points at the legacy 1.5-series models mirror; check
* https://opennlp.apache.org/models.html for current hosting before relying on it.
*/
public static Path downloadModel(String modelFileName) throws IOException {
Path modelPath = Paths.get(MODEL_CACHE_DIR, modelFileName);
if (Files.exists(modelPath)) {
return modelPath;
}
String modelUrl = "https://opennlp.sourceforge.net/models-1.5/" + modelFileName;
System.out.println("Downloading model: " + modelUrl);
try (InputStream in = new URL(modelUrl).openStream()) {
Files.copy(in, modelPath, StandardCopyOption.REPLACE_EXISTING);
}
return modelPath;
}
/**
* File names of the common legacy pre-trained English models. The newer
* UD-based models (e.g. opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin) are published
* separately on the Apache download site and use a different naming scheme.
*/
public static class Models {
public static final String SENTENCE_DETECTOR = "en-sent.bin";
public static final String TOKENIZER = "en-token.bin";
public static final String POS_TAGGER = "en-pos-maxent.bin";
public static final String NAME_FINDER = "en-ner-person.bin";
public static final String CHUNKER = "en-chunker.bin";
public static final String PARSER = "en-parser-chunking.bin";
}
}
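
A quick usage sketch (assuming the mirror URL above is still live):

// Downloads on first call, then serves the cached copy
Path sentenceModel = ModelManager.downloadModel(ModelManager.Models.SENTENCE_DETECTOR);
System.out.println("Model cached at: " + sentenceModel.toAbsolutePath());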

Core NLP Processing Pipeline

1. Comprehensive Text Processing Service

package com.opennlp.core;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import com.opennlp.utils.ModelManager;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
public class NLPProcessor implements AutoCloseable {
private SentenceDetectorME sentenceDetector;
private TokenizerME tokenizer;
private POSTaggerME posTagger;
private NameFinderME nameFinder;
private ChunkerME chunker;
private LemmatizerME lemmatizer;
private Parser parser; // loaded for the parsing stage; not exercised by analyzeText below
private boolean initialized = false;
public void initialize() throws IOException {
if (initialized) return;
// Load all models
loadSentenceModel();
loadTokenizerModel();
loadPOSTaggerModel();
loadNameFinderModel();
loadChunkerModel();
loadLemmatizerModel();
loadParserModel();
initialized = true;
System.out.println("NLP Processor initialized successfully");
}
private void loadSentenceModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.SENTENCE_DETECTOR);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
SentenceModel model = new SentenceModel(modelIn);
sentenceDetector = new SentenceDetectorME(model);
}
}
private void loadTokenizerModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.TOKENIZER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
TokenizerModel model = new TokenizerModel(modelIn);
tokenizer = new TokenizerME(model);
}
}
private void loadPOSTaggerModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.POS_TAGGER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
POSModel model = new POSModel(modelIn);
posTagger = new POSTaggerME(model);
}
}
private void loadNameFinderModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.NAME_FINDER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
nameFinder = new NameFinderME(model);
}
}
private void loadChunkerModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.CHUNKER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
ChunkerModel model = new ChunkerModel(modelIn);
chunker = new ChunkerME(model);
}
}
private void loadLemmatizerModel() throws IOException {
// No statistical lemmatizer model ships with the legacy 1.5 release;
// point this at a lemmatizer model you trained or obtained yourself.
Path modelPath = Paths.get("models/opennlp/en-lemmatizer.bin");
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
LemmatizerModel model = new LemmatizerModel(modelIn);
lemmatizer = new LemmatizerME(model);
}
}
private void loadParserModel() throws IOException {
Path modelPath = ModelManager.downloadModel(ModelManager.Models.PARSER);
try (InputStream modelIn = new FileInputStream(modelPath.toFile())) {
ParserModel model = new ParserModel(modelIn);
parser = ParserFactory.create(model);
}
}
/**
* Complete text analysis pipeline
*/
public TextAnalysisResult analyzeText(String text) {
if (!initialized) {
throw new IllegalStateException("NLP Processor not initialized");
}
TextAnalysisResult result = new TextAnalysisResult();
result.setOriginalText(text);
// Sentence detection
String[] sentences = detectSentences(text);
result.setSentences(sentences);
// Process each sentence
List<SentenceAnalysis> sentenceAnalyses = new ArrayList<>();
for (String sentence : sentences) {
SentenceAnalysis sentenceAnalysis = analyzeSentence(sentence);
sentenceAnalyses.add(sentenceAnalysis);
}
result.setSentenceAnalyses(sentenceAnalyses);
// Overall statistics
calculateStatistics(result);
return result;
}
/**
* Analyze individual sentence
*/
public SentenceAnalysis analyzeSentence(String sentence) {
SentenceAnalysis analysis = new SentenceAnalysis();
analysis.setOriginalSentence(sentence);
// Tokenization
String[] tokens = tokenize(sentence);
analysis.setTokens(tokens);
// Part-of-speech tagging
String[] posTags = tagPOS(tokens);
analysis.setPosTags(posTags);
// Named Entity Recognition
List<NamedEntity> namedEntities = findNamedEntities(tokens);
analysis.setNamedEntities(namedEntities);
// Chunking
String[] chunks = chunk(tokens, posTags);
analysis.setChunks(chunks);
// Lemmatization
String[] lemmas = lemmatize(tokens, posTags);
analysis.setLemmas(lemmas);
return analysis;
}
public String[] detectSentences(String text) {
return sentenceDetector.sentDetect(text);
}
public String[] tokenize(String text) {
return tokenizer.tokenize(text);
}
public String[] tagPOS(String[] tokens) {
return posTagger.tag(tokens);
}
public List<NamedEntity> findNamedEntities(String[] tokens) {
List<NamedEntity> entities = new ArrayList<>();
opennlp.tools.util.Span[] spans = nameFinder.find(tokens);
for (opennlp.tools.util.Span span : spans) {
String entity = String.join(" ", 
Arrays.copyOfRange(tokens, span.getStart(), span.getEnd()));
entities.add(new NamedEntity(entity, span.getType(), span.getStart(), span.getEnd()));
}
// Reset adaptive data so names from one input don't bias the next
nameFinder.clearAdaptiveData();
return entities;
}
public String[] chunk(String[] tokens, String[] posTags) {
return chunker.chunk(tokens, posTags);
}
public String[] lemmatize(String[] tokens, String[] posTags) {
return lemmatizer.lemmatize(tokens, posTags);
}
private void calculateStatistics(TextAnalysisResult result) {
int totalTokens = result.getSentenceAnalyses().stream()
.mapToInt(sa -> sa.getTokens().length)
.sum();
result.setTotalTokens(totalTokens);
int totalEntities = result.getSentenceAnalyses().stream()
.mapToInt(sa -> sa.getNamedEntities().size())
.sum();
result.setTotalNamedEntities(totalEntities);
// Calculate POS tag distribution
Map<String, Long> posDistribution = result.getSentenceAnalyses().stream()
.flatMap(sa -> Arrays.stream(sa.getPosTags()))
.collect(Collectors.groupingBy(tag -> tag, Collectors.counting()));
result.setPosDistribution(posDistribution);
}
@Override
public void close() {
// Clean up resources
sentenceDetector = null;
tokenizer = null;
posTagger = null;
nameFinder = null;
chunker = null;
lemmatizer = null;
parser = null;
initialized = false;
}
// Data transfer objects
public static class TextAnalysisResult {
private String originalText;
private String[] sentences;
private List<SentenceAnalysis> sentenceAnalyses;
private int totalTokens;
private int totalNamedEntities;
private Map<String, Long> posDistribution;
// Getters and setters
public String getOriginalText() { return originalText; }
public void setOriginalText(String originalText) { this.originalText = originalText; }
public String[] getSentences() { return sentences; }
public void setSentences(String[] sentences) { this.sentences = sentences; }
public List<SentenceAnalysis> getSentenceAnalyses() { return sentenceAnalyses; }
public void setSentenceAnalyses(List<SentenceAnalysis> sentenceAnalyses) { this.sentenceAnalyses = sentenceAnalyses; }
public int getTotalTokens() { return totalTokens; }
public void setTotalTokens(int totalTokens) { this.totalTokens = totalTokens; }
public int getTotalNamedEntities() { return totalNamedEntities; }
public void setTotalNamedEntities(int totalNamedEntities) { this.totalNamedEntities = totalNamedEntities; }
public Map<String, Long> getPosDistribution() { return posDistribution; }
public void setPosDistribution(Map<String, Long> posDistribution) { this.posDistribution = posDistribution; }
}
public static class SentenceAnalysis {
private String originalSentence;
private String[] tokens;
private String[] posTags;
private String[] lemmas;
private String[] chunks;
private List<NamedEntity> namedEntities;
// Getters and setters
public String getOriginalSentence() { return originalSentence; }
public void setOriginalSentence(String originalSentence) { this.originalSentence = originalSentence; }
public String[] getTokens() { return tokens; }
public void setTokens(String[] tokens) { this.tokens = tokens; }
public String[] getPosTags() { return posTags; }
public void setPosTags(String[] posTags) { this.posTags = posTags; }
public String[] getLemmas() { return lemmas; }
public void setLemmas(String[] lemmas) { this.lemmas = lemmas; }
public String[] getChunks() { return chunks; }
public void setChunks(String[] chunks) { this.chunks = chunks; }
public List<NamedEntity> getNamedEntities() { return namedEntities; }
public void setNamedEntities(List<NamedEntity> namedEntities) { this.namedEntities = namedEntities; }
}
public static class NamedEntity {
private final String text;
private final String type;
private final int start;
private final int end;
public NamedEntity(String text, String type, int start, int end) {
this.text = text;
this.type = type;
this.start = start;
this.end = end;
}
// Getters
public String getText() { return text; }
public String getType() { return type; }
public int getStart() { return start; }
public int getEnd() { return end; }
}
}
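
A minimal driver for the pipeline (the sample sentence and printing are illustrative):

try (NLPProcessor processor = new NLPProcessor()) {
    processor.initialize();
    NLPProcessor.TextAnalysisResult result =
            processor.analyzeText("John Smith works at Acme Corp in New York. He joined in 2019.");
    System.out.println("Sentences: " + result.getSentences().length);
    System.out.println("Tokens: " + result.getTotalTokens());
    result.getSentenceAnalyses().forEach(sa ->
            sa.getNamedEntities().forEach(ne ->
                    System.out.println(ne.getType() + ": " + ne.getText())));
}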

Custom Model Training

1. Named Entity Recognition Training
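
OpenNLP's name finder expects training data as one sentence per line, with each entity wrapped in span markers, for example:

<START:person> Pierre Vinken <END> , 61 years old , will join the board as a nonexecutive director Nov. 29 .

The trainer below consumes this format, trains a maxent model, and reports precision, recall, and F1 on held-out data.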

package com.opennlp.training;
import opennlp.tools.namefind.*;
import opennlp.tools.util.*;
import opennlp.tools.util.eval.FMeasure;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
public class NERTrainer {
/**
* Train a custom Named Entity Recognition model
*/
public void trainNERModel(Path trainingDataPath, Path modelOutputPath, 
String language) throws IOException {
// Prepare training parameters
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, "100");
params.put(TrainingParameters.CUTOFF_PARAM, "5");
params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
// Prepare training data
ObjectStream<NameSample> sampleStream = readTrainingData(trainingDataPath);
// Train the model
try {
TokenNameFinderModel model = NameFinderME.train(
language, 
"person", // entity type
sampleStream, 
params, 
new TokenNameFinderFactory()
);
// Save the model
try (OutputStream modelOut = Files.newOutputStream(modelOutputPath)) {
model.serialize(modelOut);
}
System.out.println("NER model trained and saved to: " + modelOutputPath);
} finally {
sampleStream.close();
}
}
private ObjectStream<NameSample> readTrainingData(Path trainingDataPath) throws IOException {
// OpenNLP 1.6+ takes an InputStreamFactory so the stream can be reset between passes
InputStreamFactory inputFactory = new MarkableFileInputStreamFactory(trainingDataPath.toFile());
return new NameSampleDataStream(
new PlainTextByLineStream(inputFactory, StandardCharsets.UTF_8)
);
}
/**
* Evaluate model performance
*/
public ModelEvaluation evaluateNERModel(Path modelPath, Path testDataPath) throws IOException {
try (InputStream modelIn = Files.newInputStream(modelPath)) {
TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
NameFinderME nameFinder = new NameFinderME(model);
ObjectStream<NameSample> testStream = readTrainingData(testDataPath);
FMeasure fmeasure = new FMeasure();
NameSample testSample;
while ((testSample = testStream.read()) != null) {
String[] tokens = testSample.getSentence();
opennlp.tools.util.Span[] predictedNames = nameFinder.find(tokens);
opennlp.tools.util.Span[] expectedNames = testSample.getNames();
fmeasure.updateScores(expectedNames, predictedNames);
}
testStream.close();
return new ModelEvaluation(
fmeasure.getPrecisionScore(),
fmeasure.getRecallScore(),
fmeasure.getFMeasure()
);
}
}
/**
* Generate training data in OpenNLP format
*/
public void generateTrainingData(List<NamedEntityExample> examples, Path outputPath) 
throws IOException {
try (BufferedWriter writer = Files.newBufferedWriter(outputPath, StandardCharsets.UTF_8)) {
for (NamedEntityExample example : examples) {
writer.write(example.toTrainingFormat());
writer.newLine();
}
}
}
public static class NamedEntityExample {
private final String sentence;
private final List<EntitySpan> entities;
public NamedEntityExample(String sentence, List<EntitySpan> entities) {
this.sentence = sentence;
this.entities = entities;
}
public String toTrainingFormat() {
String[] tokens = sentence.split("\\s+");
StringBuilder sb = new StringBuilder();
for (int i = 0; i < tokens.length; i++) {
// Open any entity that starts at this token (the marker precedes the token)
for (EntitySpan entity : entities) {
if (entity.start == i) {
sb.append("<START:").append(entity.type).append("> ");
}
}
sb.append(tokens[i]).append(" ");
// Close any entity whose exclusive end is the next index
for (EntitySpan entity : entities) {
if (entity.end == i + 1) {
sb.append("<END> ");
}
}
}
return sb.toString().trim();
}
}
public static class EntitySpan {
public final int start; // index of the first entity token
public final int end; // exclusive: one past the last entity token
public final String type;
public EntitySpan(int start, int end, String type) {
this.start = start;
this.end = end;
this.type = type;
}
}
public static class ModelEvaluation {
private final double precision;
private final double recall;
private final double f1Score;
public ModelEvaluation(double precision, double recall, double f1Score) {
this.precision = precision;
this.recall = recall;
this.f1Score = f1Score;
}
// Getters
public double getPrecision() { return precision; }
public double getRecall() { return recall; }
public double getF1Score() { return f1Score; }
@Override
public String toString() {
return String.format("Precision: %.3f, Recall: %.3f, F1: %.3f", 
precision, recall, f1Score);
}
}
}
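
Putting it together (the file paths are illustrative):

NERTrainer trainer = new NERTrainer();
Path trainingData = Path.of("data/ner-train.txt");
Path modelOut = Path.of("models/custom/en-ner-person-custom.bin");
trainer.trainNERModel(trainingData, modelOut, "en");
System.out.println(trainer.evaluateNERModel(modelOut, Path.of("data/ner-test.txt")));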

2. Sentiment Analysis Training

package com.opennlp.training;
import opennlp.tools.doccat.*;
import opennlp.tools.util.*;
import opennlp.tools.util.eval.FMeasure;
import opennlp.tools.util.model.ModelUtil;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
public class SentimentTrainer {
/**
* Train a sentiment analysis model
*/
public void trainSentimentModel(List<SentimentExample> trainingExamples, 
Path modelOutputPath) throws IOException {
// Prepare training data
ObjectStream<DocumentSample> sampleStream = 
new CollectionObjectStream<>(convertToDocumentSamples(trainingExamples));
// Training parameters
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, "100");
params.put(TrainingParameters.CUTOFF_PARAM, "2");
params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
// Feature generation: bag-of-words plus bigram features
DoccatFactory factory = new DoccatFactory(new FeatureGenerator[]{
new BagOfWordsFeatureGenerator(),
new NGramFeatureGenerator(2, 2) // bigrams; use (1, 2) to include unigrams as well
});
// Train model
DoccatModel model = DocumentCategorizerME.train(
"en", sampleStream, params, factory
);
// Save model
try (OutputStream modelOut = Files.newOutputStream(modelOutputPath)) {
model.serialize(modelOut);
}
System.out.println("Sentiment model trained and saved to: " + modelOutputPath);
}
private List<DocumentSample> convertToDocumentSamples(List<SentimentExample> examples) {
List<DocumentSample> samples = new ArrayList<>();
for (SentimentExample example : examples) {
// DocumentSample expects pre-tokenized text; simple whitespace split here
samples.add(new DocumentSample(example.getCategory(), example.getText().split("\\s+")));
}
return samples;
}
/**
* Cross-validate sentiment model
*/
public CrossValidationResult crossValidate(List<SentimentExample> examples, int folds) 
throws IOException {
CrossValidationPartitioner<DocumentSample> partitioner = 
new CrossValidationPartitioner<>(
convertToDocumentSamples(examples), folds
);
Map<String, FMeasure> fmeasures = new HashMap<>();
while (partitioner.hasNext()) {
CrossValidationPartitioner.TrainingSampleStream<DocumentSample> trainingStream = 
partitioner.next();
// Train on current fold
DoccatModel model = DocumentCategorizerME.train(
"en", trainingStream, ModelUtil.createDefaultParameters(), 
new DoccatFactory()
);
DocumentCategorizerME categorizer = new DocumentCategorizerME(model);
// Evaluate on the held-out partition for this fold
ObjectStream<DocumentSample> testStream = trainingStream.getTestSampleStream();
DocumentSample testSample;
while ((testSample = testStream.read()) != null) {
String[] tokens = testSample.getText(); // DocumentSample stores tokenized text
double[] outcomes = categorizer.categorize(tokens);
String predictedCategory = categorizer.getBestCategory(outcomes);
FMeasure fm = fmeasures.getOrDefault(testSample.getCategory(), new FMeasure());
// FMeasure compares reference/prediction arrays; wrap the single labels
fm.updateScores(new Object[]{testSample.getCategory()}, new Object[]{predictedCategory});
fmeasures.put(testSample.getCategory(), fm);
}
}
return new CrossValidationResult(fmeasures);
}
public static class SentimentExample {
private final String text;
private final String category; // "positive", "negative", "neutral"
public SentimentExample(String text, String category) {
this.text = text;
this.category = category;
}
// Getters
public String getText() { return text; }
public String getCategory() { return category; }
}
public static class CrossValidationResult {
private final Map<String, CategoryMetrics> categoryMetrics;
private final double macroF1;
public CrossValidationResult(Map<String, FMeasure> fmeasures) {
this.categoryMetrics = new HashMap<>();
double totalF1 = 0;
int count = 0;
for (Map.Entry<String, FMeasure> entry : fmeasures.entrySet()) {
FMeasure fm = entry.getValue();
categoryMetrics.put(entry.getKey(), new CategoryMetrics(
fm.getPrecisionScore(), fm.getRecallScore(), fm.getFMeasure()));
totalF1 += fm.getFMeasure();
count++;
}
// Macro-averaged F1 across categories (not raw accuracy)
this.macroF1 = count > 0 ? totalF1 / count : 0;
}
// Getters
public Map<String, CategoryMetrics> getCategoryMetrics() { return categoryMetrics; }
public double getMacroF1() { return macroF1; }
}
public static class CategoryMetrics {
private final double precision;
private final double recall;
private final double f1Score;
public CategoryMetrics(double precision, double recall, double f1Score) {
this.precision = precision;
this.recall = recall;
this.f1Score = f1Score;
}
// Getters
public double getPrecision() { return precision; }
public double getRecall() { return recall; }
public double getF1Score() { return f1Score; }
}
}
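
A short driver (toy examples; real training needs many labeled texts per category):

List<SentimentTrainer.SentimentExample> examples = List.of(
    new SentimentTrainer.SentimentExample("I love this product, it works great", "positive"),
    new SentimentTrainer.SentimentExample("Terrible quality, it broke in a day", "negative"),
    new SentimentTrainer.SentimentExample("The package arrived on Tuesday", "neutral"));
new SentimentTrainer().trainSentimentModel(examples, Path.of("models/custom/sentiment.bin"));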

Real-World Applications

1. Document Classification System

package com.opennlp.applications;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
public class DocumentClassifier {
private DocumentCategorizerME categorizer;
private TokenizerME tokenizer;
private Map<String, Double> confidenceThresholds;
public void initialize(Path modelPath, Path tokenizerModelPath) throws IOException {
try (InputStream modelIn = Files.newInputStream(modelPath)) {
DoccatModel model = new DoccatModel(modelIn);
categorizer = new DocumentCategorizerME(model);
}
try (InputStream tokenizerIn = Files.newInputStream(tokenizerModelPath)) {
TokenizerModel tokenizerModel = new TokenizerModel(tokenizerIn);
tokenizer = new TokenizerME(tokenizerModel);
}
// Set confidence thresholds for different categories
confidenceThresholds = new HashMap<>();
confidenceThresholds.put("urgent", 0.8);
confidenceThresholds.put("important", 0.7);
confidenceThresholds.put("normal", 0.5);
confidenceThresholds.put("low", 0.3);
}
public ClassificationResult classifyDocument(String documentText) {
String[] tokens = tokenizer.tokenize(documentText);
double[] outcomes = categorizer.categorize(tokens);
String bestCategory = categorizer.getBestCategory(outcomes);
double confidence = outcomes[categorizer.getIndex(bestCategory)];
// Apply confidence threshold
double threshold = confidenceThresholds.getOrDefault(bestCategory, 0.5);
if (confidence < threshold) {
bestCategory = "uncertain";
}
// Get all categories with scores
Map<String, Double> allScores = new HashMap<>();
for (int i = 0; i < categorizer.getNumberOfCategories(); i++) {
allScores.put(categorizer.getCategory(i), outcomes[i]);
}
return new ClassificationResult(bestCategory, confidence, allScores);
}
public List<DocumentClassification> batchClassify(List<Document> documents) {
return documents.parallelStream()
.map(doc -> {
ClassificationResult result = classifyDocument(doc.getContent());
return new DocumentClassification(doc, result);
})
.collect(Collectors.toList());
}
/**
* Extract key phrases from document based on classification
*/
public List<String> extractKeyPhrases(String documentText, String category) {
// Naive regex split; prefer a SentenceDetectorME for robust boundaries
String[] sentences = documentText.split("[.!?]+");
List<String> keyPhrases = new ArrayList<>();
for (String sentence : sentences) {
ClassificationResult result = classifyDocument(sentence);
if (result.getCategory().equals(category) && 
result.getConfidence() > 0.7) {
keyPhrases.add(sentence.trim());
}
}
return keyPhrases;
}
public static class ClassificationResult {
private final String category;
private final double confidence;
private final Map<String, Double> allScores;
public ClassificationResult(String category, double confidence, 
Map<String, Double> allScores) {
this.category = category;
this.confidence = confidence;
this.allScores = Collections.unmodifiableMap(allScores);
}
// Getters
public String getCategory() { return category; }
public double getConfidence() { return confidence; }
public Map<String, Double> getAllScores() { return allScores; }
}
public static class Document {
private final String id;
private final String content;
private final String title;
public Document(String id, String title, String content) {
this.id = id;
this.title = title;
this.content = content;
}
// Getters
public String getId() { return id; }
public String getContent() { return content; }
public String getTitle() { return title; }
}
public static class DocumentClassification {
private final Document document;
private final ClassificationResult classification;
public DocumentClassification(Document document, ClassificationResult classification) {
this.document = document;
this.classification = classification;
}
// Getters
public Document getDocument() { return document; }
public ClassificationResult getClassification() { return classification; }
}
}
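
Example usage (the model paths are placeholders for models you trained or downloaded earlier):

DocumentClassifier classifier = new DocumentClassifier();
classifier.initialize(Path.of("models/custom/doc-classifier.bin"),
        Path.of("models/opennlp/en-token.bin"));
DocumentClassifier.ClassificationResult result =
        classifier.classifyDocument("Server outage in us-east; multiple customers affected.");
System.out.println(result.getCategory() + " (confidence " + result.getConfidence() + ")");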

2. Intelligent Chatbot with NLP

package com.opennlp.applications;
import com.opennlp.core.NLPProcessor;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DoccatModel;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
public class IntelligentChatbot {
private NLPProcessor nlpProcessor;
private DocumentCategorizerME intentClassifier;
private Map<String, ResponseGenerator> responseGenerators;
private ChatHistory chatHistory;
public void initialize() throws Exception {
nlpProcessor = new NLPProcessor();
nlpProcessor.initialize();
// Load intent classification model
Path intentModelPath = Path.of("models/chatbot/intent-classifier.bin");
try (InputStream modelIn = Files.newInputStream(intentModelPath)) {
DoccatModel model = new DoccatModel(modelIn);
intentClassifier = new DocumentCategorizerME(model);
}
initializeResponseGenerators();
chatHistory = new ChatHistory();
}
private void initializeResponseGenerators() {
responseGenerators = new HashMap<>();
responseGenerators.put("greeting", new GreetingResponseGenerator());
responseGenerators.put("question", new QuestionResponseGenerator());
responseGenerators.put("complaint", new ComplaintResponseGenerator());
responseGenerators.put("feedback", new FeedbackResponseGenerator());
responseGenerators.put("goodbye", new GoodbyeResponseGenerator());
}
public ChatResponse processMessage(String userMessage, String userId) {
try {
// Analyze message with NLP
NLPProcessor.TextAnalysisResult analysis = nlpProcessor.analyzeText(userMessage);
// Classify intent
String intent = classifyIntent(userMessage);
// Extract entities
List<String> entities = extractEntities(analysis);
// Generate response
String response = generateResponse(intent, userMessage, entities, userId);
// Update chat history
chatHistory.addMessage(userId, userMessage, response, intent);
return new ChatResponse(response, intent, entities, analysis.getSentences());
} catch (Exception e) {
return new ChatResponse(
"I apologize, but I encountered an error processing your message.", 
"error", Collections.emptyList(), new String[0]
);
}
}
private String classifyIntent(String message) {
String[] tokens = nlpProcessor.tokenize(message);
double[] outcomes = intentClassifier.categorize(tokens);
return intentClassifier.getBestCategory(outcomes);
}
private List<String> extractEntities(NLPProcessor.TextAnalysisResult analysis) {
return analysis.getSentenceAnalyses().stream()
.flatMap(sa -> sa.getNamedEntities().stream())
.map(NLPProcessor.NamedEntity::getText)
.collect(Collectors.toList());
}
private String generateResponse(String intent, String userMessage, 
List<String> entities, String userId) {
ResponseGenerator generator = responseGenerators.get(intent);
if (generator != null) {
return generator.generateResponse(userMessage, entities, userId, chatHistory);
}
return "I understand you're saying: \"" + userMessage + 
"\". Could you please rephrase that?";
}
public ChatStatistics getChatStatistics(String userId) {
return chatHistory.getStatistics(userId);
}
// Response generator interfaces
public interface ResponseGenerator {
String generateResponse(String userMessage, List<String> entities, 
String userId, ChatHistory history);
}
public static class GreetingResponseGenerator implements ResponseGenerator {
private final Random random = new Random();
private final List<String> greetings = Arrays.asList(
"Hello! How can I help you today?",
"Hi there! What can I do for you?",
"Greetings! How may I assist you?",
"Welcome! How can I be of service?"
);
@Override
public String generateResponse(String userMessage, List<String> entities, 
String userId, ChatHistory history) {
return greetings.get(random.nextInt(greetings.size()));
}
}
public static class QuestionResponseGenerator implements ResponseGenerator {
@Override
public String generateResponse(String userMessage, List<String> entities, 
String userId, ChatHistory history) {
if (!entities.isEmpty()) {
return "Regarding " + entities.get(0) + ", I'd be happy to help. " +
"Could you provide more specific details about your question?";
}
return "That's an interesting question. Let me think about how best to assist you.";
}
}
// The remaining intents registered above need generators too; minimal
// placeholders so the class compiles (flesh these out for real use)
public static class ComplaintResponseGenerator implements ResponseGenerator {
@Override
public String generateResponse(String userMessage, List<String> entities,
String userId, ChatHistory history) {
return "I'm sorry to hear that. I've logged your complaint and will escalate it.";
}
}
public static class FeedbackResponseGenerator implements ResponseGenerator {
@Override
public String generateResponse(String userMessage, List<String> entities,
String userId, ChatHistory history) {
return "Thank you for the feedback! We'll use it to improve.";
}
}
public static class GoodbyeResponseGenerator implements ResponseGenerator {
@Override
public String generateResponse(String userMessage, List<String> entities,
String userId, ChatHistory history) {
return "Goodbye! Feel free to reach out anytime.";
}
}
// Chat history management
public static class ChatHistory {
private final Map<String, List<ChatMessage>> userHistories;
private final int maxHistorySize = 100;
public ChatHistory() {
this.userHistories = new HashMap<>();
}
public void addMessage(String userId, String userMessage, String botResponse, String intent) {
List<ChatMessage> history = userHistories.computeIfAbsent(userId, k -> new ArrayList<>());
history.add(new ChatMessage(userMessage, botResponse, intent, System.currentTimeMillis()));
// Limit history size
if (history.size() > maxHistorySize) {
history.remove(0);
}
}
public List<ChatMessage> getUserHistory(String userId) {
return Collections.unmodifiableList(
userHistories.getOrDefault(userId, Collections.emptyList())
);
}
public ChatStatistics getStatistics(String userId) {
List<ChatMessage> history = getUserHistory(userId);
Map<String, Long> intentCounts = history.stream()
.collect(Collectors.groupingBy(ChatMessage::getIntent, Collectors.counting()));
return new ChatStatistics(history.size(), intentCounts);
}
}
public static class ChatMessage {
private final String userMessage;
private final String botResponse;
private final String intent;
private final long timestamp;
public ChatMessage(String userMessage, String botResponse, String intent, long timestamp) {
this.userMessage = userMessage;
this.botResponse = botResponse;
this.intent = intent;
this.timestamp = timestamp;
}
// Getters
public String getUserMessage() { return userMessage; }
public String getBotResponse() { return botResponse; }
public String getIntent() { return intent; }
public long getTimestamp() { return timestamp; }
}
public static class ChatResponse {
private final String response;
private final String intent;
private final List<String> entities;
private final String[] sentences;
public ChatResponse(String response, String intent, List<String> entities, String[] sentences) {
this.response = response;
this.intent = intent;
this.entities = entities;
this.sentences = sentences;
}
// Getters
public String getResponse() { return response; }
public String getIntent() { return intent; }
public List<String> getEntities() { return entities; }
public String[] getSentences() { return sentences; }
}
public static class ChatStatistics {
private final int totalMessages;
private final Map<String, Long> intentDistribution;
public ChatStatistics(int totalMessages, Map<String, Long> intentDistribution) {
this.totalMessages = totalMessages;
this.intentDistribution = Collections.unmodifiableMap(intentDistribution);
}
// Getters
public int getTotalMessages() { return totalMessages; }
public Map<String, Long> getIntentDistribution() { return intentDistribution; }
}
}
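
Wiring it up (assumes an intent model already exists at models/chatbot/intent-classifier.bin, as loaded in initialize()):

IntelligentChatbot bot = new IntelligentChatbot();
bot.initialize();
IntelligentChatbot.ChatResponse reply =
        bot.processMessage("Hi! I have a question about my Acme order", "user-42");
System.out.println(reply.getIntent() + " -> " + reply.getResponse());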

Performance Optimization

1. Caching and Performance Monitoring

package com.opennlp.optimization;
import com.opennlp.core.NLPProcessor;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
public class OptimizedNLPProcessor {
private final NLPProcessor nlpProcessor;
private final Map<String, NLPProcessor.TextAnalysisResult> analysisCache;
private final ExecutorService processingPool;
private final PerformanceMonitor performanceMonitor;
private final int cacheSize;
public OptimizedNLPProcessor(int cacheSize, int threadPoolSize) throws Exception {
this.nlpProcessor = new NLPProcessor();
this.nlpProcessor.initialize();
this.cacheSize = cacheSize;
// Access-ordered LinkedHashMap gives simple LRU eviction
this.analysisCache = new LinkedHashMap<String, NLPProcessor.TextAnalysisResult>(16, 0.75f, true) {
@Override
protected boolean removeEldestEntry(Map.Entry<String, NLPProcessor.TextAnalysisResult> eldest) {
return size() > cacheSize;
}
};
this.processingPool = Executors.newFixedThreadPool(threadPoolSize);
this.performanceMonitor = new PerformanceMonitor();
}
public CompletableFuture<NLPProcessor.TextAnalysisResult> analyzeTextAsync(String text) {
String cacheKey = generateCacheKey(text);
// Check cache first
synchronized (analysisCache) {
NLPProcessor.TextAnalysisResult cached = analysisCache.get(cacheKey);
if (cached != null) {
performanceMonitor.recordCacheHit();
return CompletableFuture.completedFuture(cached);
}
}
performanceMonitor.recordCacheMiss();
return CompletableFuture.supplyAsync(() -> {
long startTime = System.nanoTime();
try {
NLPProcessor.TextAnalysisResult result = nlpProcessor.analyzeText(text);
// Cache the result
synchronized (analysisCache) {
analysisCache.put(cacheKey, result);
}
long processingTime = System.nanoTime() - startTime;
performanceMonitor.recordProcessingTime(processingTime);
return result;
} catch (Exception e) {
performanceMonitor.recordError();
throw new CompletionException("Text analysis failed", e);
}
}, processingPool);
}
public List<CompletableFuture<NLPProcessor.TextAnalysisResult>> analyzeBatchAsync(
List<String> texts) {
return texts.stream()
.map(this::analyzeTextAsync)
.collect(Collectors.toList());
}
private String generateCacheKey(String text) {
// hashCode is fast but can collide; use a digest such as SHA-256 if a
// wrong cache hit would be unacceptable
return Integer.toHexString(text.hashCode());
}
public PerformanceStats getPerformanceStats() {
return performanceMonitor.getStats();
}
public void clearCache() {
synchronized (analysisCache) {
analysisCache.clear();
}
}
public void shutdown() {
processingPool.shutdown();
try {
if (!processingPool.awaitTermination(5, TimeUnit.SECONDS)) {
processingPool.shutdownNow();
}
} catch (InterruptedException e) {
processingPool.shutdownNow();
Thread.currentThread().interrupt();
}
nlpProcessor.close();
}
public static class PerformanceMonitor {
private final AtomicLong cacheHits = new AtomicLong();
private final AtomicLong cacheMisses = new AtomicLong();
private final AtomicLong totalProcessingTime = new AtomicLong();
private final AtomicLong totalProcessedTexts = new AtomicLong();
private final AtomicLong errors = new AtomicLong();
public void recordCacheHit() {
cacheHits.incrementAndGet();
}
public void recordCacheMiss() {
cacheMisses.incrementAndGet();
}
public void recordProcessingTime(long nanos) {
totalProcessingTime.addAndGet(nanos);
totalProcessedTexts.incrementAndGet();
}
public void recordError() {
errors.incrementAndGet();
}
public PerformanceStats getStats() {
long hits = cacheHits.get();
long misses = cacheMisses.get();
long totalRequests = hits + misses;
double hitRatio = totalRequests > 0 ? (double) hits / totalRequests : 0.0;
double avgProcessingTime = totalProcessedTexts.get() > 0 ? 
(double) totalProcessingTime.get() / totalProcessedTexts.get() / 1_000_000 : 0.0;
return new PerformanceStats(
hits, misses, hitRatio, avgProcessingTime, errors.get()
);
}
}
public static class PerformanceStats {
private final long cacheHits;
private final long cacheMisses;
private final double cacheHitRatio;
private final double averageProcessingTimeMs;
private final long errorCount;
public PerformanceStats(long cacheHits, long cacheMisses, double cacheHitRatio,
double averageProcessingTimeMs, long errorCount) {
this.cacheHits = cacheHits;
this.cacheMisses = cacheMisses;
this.cacheHitRatio = cacheHitRatio;
this.averageProcessingTimeMs = averageProcessingTimeMs;
this.errorCount = errorCount;
}
// Getters
public long getCacheHits() { return cacheHits; }
public long getCacheMisses() { return cacheMisses; }
public double getCacheHitRatio() { return cacheHitRatio; }
public double getAverageProcessingTimeMs() { return averageProcessingTimeMs; }
public long getErrorCount() { return errorCount; }
@Override
public String toString() {
return String.format(
"Cache: %.1f%% hit ratio, Avg Time: %.2fms, Errors: %d",
cacheHitRatio * 100, averageProcessingTimeMs, errorCount
);
}
}
}
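
Example usage showing batch analysis and the collected statistics:

OptimizedNLPProcessor optimized = new OptimizedNLPProcessor(1_000, 4);
List<String> texts = List.of(
    "OpenNLP ships several trainable components.",
    "Caching repeated inputs avoids redundant analysis.");
List<CompletableFuture<NLPProcessor.TextAnalysisResult>> futures =
        optimized.analyzeBatchAsync(texts);
CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
System.out.println(optimized.getPerformanceStats());
optimized.shutdown();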

Conclusion

Apache OpenNLP provides robust NLP capabilities for Java applications:

Key Features:

  • Sentence Detection - Split text into sentences
  • Tokenization - Break text into words/tokens
  • Part-of-Speech Tagging - Identify grammatical components
  • Named Entity Recognition - Extract people, organizations, locations
  • Text Classification - Categorize documents and sentences
  • Model Training - Custom model development

Best Practices:

  • Use caching for frequently processed texts
  • Implement async processing for better performance
  • Monitor model performance and accuracy
  • Combine rule-based and ML-based approaches
  • Handle multiple languages with appropriate models

Use Cases:

  • Document processing and classification
  • Chatbots and virtual assistants
  • Sentiment analysis for social media
  • Information extraction from text
  • Content recommendation systems

OpenNLP's combination of pre-trained models and custom training capabilities makes it suitable for both rapid prototyping and production-grade NLP applications.
