Introduction
Named Entity Recognition (NER) is a fundamental Natural Language Processing task that identifies and classifies named entities in text into predefined categories like persons, organizations, locations, dates, etc. This article provides comprehensive implementations of NER systems in Java, from rule-based approaches to machine learning solutions.
Project Setup
Maven Dependencies
<!-- pom.xml -->
<!-- Library versions are centralized here so upgrades touch a single place -->
<properties>
<opennlp.version>2.3.2</opennlp.version>
<stanfordnlp.version>4.5.4</stanfordnlp.version>
<deeplearning4j.version>1.0.0-M2.1</deeplearning4j.version>
<jsoup.version>1.17.1</jsoup.version>
</properties>
<dependencies>
<!-- Apache OpenNLP -->
<dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
<version>${opennlp.version}</version>
</dependency>
<!-- Stanford CoreNLP -->
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>${stanfordnlp.version}</version>
</dependency>
<!-- CoreNLP models jar (same coordinates, 'models' classifier); required at
     runtime by the NER/coref annotators and is a large download -->
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>${stanfordnlp.version}</version>
<classifier>models</classifier>
</dependency>
<!-- DeepLearning4J for Neural NER -->
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-core</artifactId>
<version>${deeplearning4j.version}</version>
</dependency>
<!-- ND4J native backend; shares the DL4J version on purpose -->
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-native-platform</artifactId>
<version>${deeplearning4j.version}</version>
</dependency>
<!-- HTML Parsing -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
<!-- JSON Processing -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.15.2</version>
</dependency>
<!-- Logging -->
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.4.11</version>
</dependency>
</dependencies>
Core NER Framework
Entity Types and Models
package com.ner.core;
/**
 * Categories a recognized entity can belong to.
 *
 * <p>Each constant carries a human-readable display name used for output and
 * for reverse lookup via {@link #fromString(String)}.
 */
public enum EntityType {
    PERSON("Person"),
    ORGANIZATION("Organization"),
    LOCATION("Location"),
    DATE("Date"),
    TIME("Time"),
    MONEY("Money"),
    PERCENT("Percent"),
    FACILITY("Facility"),
    GPE("Geo-Political Entity"), // Countries, cities, states
    NATIONALITY("Nationality"),
    RELIGION("Religion"),
    TITLE("Title"),
    IDEOLOGY("Ideology"),
    CRIMINAL_CHARGE("Criminal Charge"),
    CAUSE_OF_DEATH("Cause of Death"),
    NUMBER("Number"),
    ORDINAL("Ordinal"),
    DURATION("Duration"),
    SET("Set"),
    EMAIL("Email"),
    URL("URL"),
    PHONE("Phone"),
    IP_ADDRESS("IP Address"),
    UNKNOWN("Unknown");

    private final String displayName;

    EntityType(String displayName) {
        this.displayName = displayName;
    }

    /** @return the human-readable label for this type */
    public String getDisplayName() {
        return displayName;
    }

    /**
     * Resolves a type from either its display name ("Person") or its constant
     * name ("PERSON", "GPE"), ignoring case.
     *
     * @param text candidate name; may be {@code null}
     * @return the matching type, or {@link #UNKNOWN} if {@code text} is
     *     {@code null} or matches nothing
     */
    public static EntityType fromString(String text) {
        if (text == null) {
            return UNKNOWN;
        }
        for (EntityType type : values()) {
            if (type.displayName.equalsIgnoreCase(text) || type.name().equalsIgnoreCase(text)) {
                return type;
            }
        }
        return UNKNOWN;
    }
}
Named Entity Class
package com.ner.core;
import java.util.Objects;
/**
 * An immutable entity mention: its surface text, type, character span in the
 * source document, and a confidence score in [0, 1].
 *
 * <p>Equality is based on text, type, and offsets; confidence is deliberately
 * excluded so the same mention found by two engines with different scores
 * compares equal.
 */
public class NamedEntity {
    private final String text;
    private final EntityType type;
    private final int startOffset; // inclusive character index
    private final int endOffset;   // exclusive character index
    private final double confidence;

    public NamedEntity(String text, EntityType type, int startOffset, int endOffset, double confidence) {
        this.text = text;
        this.type = type;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
        this.confidence = confidence;
    }

    /** Convenience constructor for rule-derived entities; confidence defaults to 1.0. */
    public NamedEntity(String text, EntityType type, int startOffset, int endOffset) {
        this(text, type, startOffset, endOffset, 1.0);
    }

    // Getters
    public String getText() { return text; }
    public EntityType getType() { return type; }
    public int getStartOffset() { return startOffset; }
    public int getEndOffset() { return endOffset; }
    public double getConfidence() { return confidence; }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        NamedEntity that = (NamedEntity) o;
        return startOffset == that.startOffset &&
                endOffset == that.endOffset &&
                Objects.equals(text, that.text) &&
                type == that.type;
    }

    @Override
    public int hashCode() {
        // Must hash exactly the fields compared in equals()
        return Objects.hash(text, type, startOffset, endOffset);
    }

    @Override
    public String toString() {
        // Locale.ROOT keeps the %.2f confidence rendering stable ("0.90", never
        // "0,90") regardless of the JVM's default locale.
        return String.format(java.util.Locale.ROOT, "[%s] %s (%d-%d) %.2f",
                type.getDisplayName(), text, startOffset, endOffset, confidence);
    }
}
Rule-Based NER System
Dictionary-Based NER
package com.ner.rulebased;
import com.ner.core.EntityType;
import com.ner.core.NamedEntity;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Rule-based NER combining gazetteer (dictionary) lookup with regex patterns
 * for structured entities (emails, URLs, phones, dates, money, IPs).
 *
 * <p>Dictionaries are loaded from classpath resources at construction time; a
 * missing resource only logs a warning. Overlapping matches are resolved by
 * keeping the longest span.
 */
public class DictionaryNER {

    /** Maximum phrase length, in tokens, tried during dictionary lookup. */
    private static final int MAX_NGRAM = 5;

    /** Matches one whitespace-delimited token; used to recover token offsets. */
    private static final Pattern TOKEN = Pattern.compile("\\S+");

    // Entity dictionaries keyed by type; several resources may feed one type.
    private final Map<EntityType, Set<String>> entityDictionaries;
    // Lower-cased phrase -> type, flattened across all dictionaries for O(1) lookup.
    private final Map<String, EntityType> lookupCache;
    // Regex rules applied after dictionary lookup.
    private final List<PatternRule> regexRules;

    /** Couples a compiled pattern with the type and confidence it produces. */
    private static final class PatternRule {
        final Pattern pattern;
        final EntityType type;
        final double confidence;

        PatternRule(Pattern pattern, EntityType type, double confidence) {
            this.pattern = pattern;
            this.type = type;
            this.confidence = confidence;
        }
    }

    public DictionaryNER() {
        this.entityDictionaries = new EnumMap<>(EntityType.class);
        this.lookupCache = new HashMap<>();
        this.regexRules = new ArrayList<>();
        initializeDictionaries();
        initializeRegexPatterns();
    }

    /** Loads all bundled dictionaries and builds the flat lookup cache. */
    private void initializeDictionaries() {
        loadDictionary("dictionaries/person.txt", EntityType.PERSON);
        loadDictionary("dictionaries/organization.txt", EntityType.ORGANIZATION);
        loadDictionary("dictionaries/location.txt", EntityType.LOCATION);
        loadDictionary("dictionaries/countries.txt", EntityType.GPE);
        loadDictionary("dictionaries/cities.txt", EntityType.LOCATION);
        for (Map.Entry<EntityType, Set<String>> entry : entityDictionaries.entrySet()) {
            for (String entity : entry.getValue()) {
                lookupCache.put(entity.toLowerCase(), entry.getKey());
            }
        }
    }

    /**
     * Reads one entity name per line from a UTF-8 classpath resource and MERGES
     * it into the dictionary for {@code type}. (The previous implementation
     * replaced the set, so loading cities.txt wiped the location.txt entries.)
     */
    private void loadDictionary(String resourcePath, EntityType type) {
        InputStream is = getClass().getClassLoader().getResourceAsStream(resourcePath);
        if (is == null) {
            System.err.println("Warning: Could not load dictionary: " + resourcePath);
            entityDictionaries.computeIfAbsent(type, k -> new HashSet<>());
            return;
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8))) {
            Set<String> entities = new HashSet<>();
            String line;
            while ((line = reader.readLine()) != null) {
                String entity = line.trim();
                if (!entity.isEmpty()) {
                    entities.add(entity);
                }
            }
            entityDictionaries.computeIfAbsent(type, k -> new HashSet<>()).addAll(entities);
        } catch (IOException e) {
            System.err.println("Warning: Could not load dictionary: " + resourcePath);
            entityDictionaries.computeIfAbsent(type, k -> new HashSet<>());
        }
    }

    /** Registers one rule per structured-entity kind, with its confidence. */
    private void initializeRegexPatterns() {
        // Email pattern
        regexRules.add(new PatternRule(Pattern.compile(
                "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b",
                Pattern.CASE_INSENSITIVE), EntityType.EMAIL, 0.95));
        // URL pattern
        regexRules.add(new PatternRule(Pattern.compile(
                "https?://[\\w\\-\\.]+\\.[a-z]{2,}(/\\S*)?",
                Pattern.CASE_INSENSITIVE), EntityType.URL, 0.95));
        // Phone number pattern (international format); deliberately loose, hence
        // the lower confidence
        regexRules.add(new PatternRule(Pattern.compile(
                "\\+?[\\d\\s-()]{10,}",
                Pattern.CASE_INSENSITIVE), EntityType.PHONE, 0.8));
        // Numeric date, e.g. 12/31/2024 or 31-12-24
        regexRules.add(new PatternRule(Pattern.compile(
                "\\b\\d{1,2}[/-]\\d{1,2}[/-]\\d{2,4}\\b"), EntityType.DATE, 0.7));
        // Month-name date, e.g. "Jan 5, 2024"
        regexRules.add(new PatternRule(Pattern.compile(
                "\\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \\d{1,2},? \\d{4}\\b",
                Pattern.CASE_INSENSITIVE), EntityType.DATE, 0.8));
        // Money pattern: "$12.50" or "12 dollars"/"12 USD"
        regexRules.add(new PatternRule(Pattern.compile(
                "\\$\\d+(?:\\.\\d{1,2})?\\b|\\b\\d+(?:\\.\\d{1,2})?\\s*(?:dollars|USD)\\b",
                Pattern.CASE_INSENSITIVE), EntityType.MONEY, 0.75));
        // IPv4 address (octet ranges not validated)
        regexRules.add(new PatternRule(Pattern.compile(
                "\\b(?:[0-9]{1,3}\\.){3}[0-9]{1,3}\\b"), EntityType.IP_ADDRESS, 0.9));
    }

    /**
     * Extracts entities from {@code text} via dictionary lookup plus regex
     * rules, then collapses overlapping spans keeping the longest.
     */
    public List<NamedEntity> extractEntities(String text) {
        List<NamedEntity> entities = new ArrayList<>();
        entities.addAll(extractDictionaryEntities(text));
        entities.addAll(extractRegexEntities(text));
        return mergeOverlappingEntities(entities);
    }

    /**
     * Scans all 1..MAX_NGRAM token windows against the lookup cache, longest
     * first so a long match suppresses shorter overlapping ones. Offsets come
     * from the actual token positions, so repeated phrases each get their own
     * correct span (the old indexOf-based code always returned the first one).
     */
    private List<NamedEntity> extractDictionaryEntities(String text) {
        List<NamedEntity> entities = new ArrayList<>();
        // Token character spans: {start, end} per token.
        List<int[]> spans = new ArrayList<>();
        var tokenMatcher = TOKEN.matcher(text);
        while (tokenMatcher.find()) {
            spans.add(new int[] {tokenMatcher.start(), tokenMatcher.end()});
        }
        boolean[] used = new boolean[spans.size()];
        for (int n = MAX_NGRAM; n >= 1; n--) {
            for (int i = 0; i + n <= spans.size(); i++) {
                if (anyUsed(used, i, n)) {
                    continue; // some token already belongs to a longer match
                }
                // Normalized lookup key: tokens joined by a single space.
                StringBuilder key = new StringBuilder();
                for (int j = 0; j < n; j++) {
                    if (j > 0) key.append(' ');
                    int[] s = spans.get(i + j);
                    key.append(text, s[0], s[1]);
                }
                EntityType type = lookupCache.get(key.toString().toLowerCase());
                if (type != null) {
                    int start = spans.get(i)[0];
                    int end = spans.get(i + n - 1)[1];
                    // Keep the exact source substring so the offsets stay valid.
                    entities.add(new NamedEntity(text.substring(start, end), type,
                            start, end, 0.9));
                    for (int j = 0; j < n; j++) {
                        used[i + j] = true;
                    }
                }
            }
        }
        return entities;
    }

    /** True if any token in [i, i+n) was already consumed by a longer match. */
    private static boolean anyUsed(boolean[] used, int i, int n) {
        for (int j = 0; j < n; j++) {
            if (used[i + j]) {
                return true;
            }
        }
        return false;
    }

    /** Runs every regex rule over the text; one entity per match. */
    private List<NamedEntity> extractRegexEntities(String text) {
        List<NamedEntity> entities = new ArrayList<>();
        for (PatternRule rule : regexRules) {
            var matcher = rule.pattern.matcher(text);
            while (matcher.find()) {
                entities.add(new NamedEntity(matcher.group(), rule.type,
                        matcher.start(), matcher.end(), rule.confidence));
            }
        }
        return entities;
    }

    /**
     * Resolves overlaps in a single left-to-right sweep, keeping the longer of
     * any two overlapping entities.
     */
    private List<NamedEntity> mergeOverlappingEntities(List<NamedEntity> entities) {
        entities.sort(Comparator.comparingInt(NamedEntity::getStartOffset));
        List<NamedEntity> merged = new ArrayList<>();
        NamedEntity current = null;
        for (NamedEntity entity : entities) {
            if (current == null) {
                current = entity;
            } else if (entity.getStartOffset() <= current.getEndOffset()) {
                // Overlapping entities - keep the longer one
                if (entity.getEndOffset() - entity.getStartOffset() >
                        current.getEndOffset() - current.getStartOffset()) {
                    current = entity;
                }
            } else {
                merged.add(current);
                current = entity;
            }
        }
        if (current != null) {
            merged.add(current);
        }
        return merged;
    }

    /** Adds a single entity to the dictionary and the lookup cache at runtime. */
    public void addEntity(String entity, EntityType type) {
        String normalized = entity.toLowerCase();
        entityDictionaries.computeIfAbsent(type, k -> new HashSet<>()).add(entity);
        lookupCache.put(normalized, type);
    }

    /** Removes an entity (exact original casing) from every dictionary and the cache. */
    public void removeEntity(String entity) {
        String normalized = entity.toLowerCase();
        lookupCache.remove(normalized);
        for (Set<String> entities : entityDictionaries.values()) {
            entities.remove(entity);
        }
    }

    /** @return a read-only view of the dictionary for {@code type} (empty if none) */
    public Set<String> getEntitiesByType(EntityType type) {
        return Collections.unmodifiableSet(
                entityDictionaries.getOrDefault(type, Collections.emptySet()));
    }
}
Machine Learning NER with OpenNLP
OpenNLP NER Implementation
package com.ner.ml;
import com.ner.core.EntityType;
import com.ner.core.NamedEntity;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import java.io.InputStream;
import java.util.*;
/**
 * Statistical NER built on Apache OpenNLP's pre-trained {@code NameFinderME}
 * models, one per entity category, loaded from the classpath.
 *
 * <p>NOTE(review): {@code NameFinderME} keeps per-document adaptive state
 * (cleared after each call below); sharing one instance of this class across
 * threads should be verified against the OpenNLP docs before doing so.
 */
public class OpenNLPNER {

    // One finder per entity category, each wrapping a pre-trained model.
    private final NameFinderME personFinder;
    private final NameFinderME organizationFinder;
    private final NameFinderME locationFinder;
    private final NameFinderME dateFinder;
    private final NameFinderME timeFinder;
    private final NameFinderME moneyFinder;
    private final NameFinderME percentageFinder;
    private final Tokenizer tokenizer;

    /**
     * Loads all models from classpath resources.
     *
     * @throws RuntimeException if any model resource is missing or unreadable
     */
    public OpenNLPNER() {
        try {
            // Load pre-trained models
            this.personFinder = createNameFinder("models/en-ner-person.bin");
            this.organizationFinder = createNameFinder("models/en-ner-organization.bin");
            this.locationFinder = createNameFinder("models/en-ner-location.bin");
            this.dateFinder = createNameFinder("models/en-ner-date.bin");
            this.timeFinder = createNameFinder("models/en-ner-time.bin");
            this.moneyFinder = createNameFinder("models/en-ner-money.bin");
            this.percentageFinder = createNameFinder("models/en-ner-percentage.bin");
            // Load tokenizer
            try (InputStream tokenModelIn = getClass().getClassLoader()
                    .getResourceAsStream("models/en-token.bin")) {
                if (tokenModelIn == null) {
                    // Fail fast with a clear message instead of an opaque NPE
                    throw new RuntimeException("Model not found: models/en-token.bin");
                }
                TokenizerModel tokenModel = new TokenizerModel(tokenModelIn);
                this.tokenizer = new TokenizerME(tokenModel);
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to initialize OpenNLP NER", e);
        }
    }

    /** Loads one token-name-finder model from the classpath. */
    private NameFinderME createNameFinder(String modelPath) throws Exception {
        try (InputStream modelIn = getClass().getClassLoader().getResourceAsStream(modelPath)) {
            if (modelIn == null) {
                throw new RuntimeException("Model not found: " + modelPath);
            }
            TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
            return new NameFinderME(model);
        }
    }

    /**
     * Runs every finder over the tokenized text and returns the de-duplicated,
     * position-sorted union of their results.
     */
    public List<NamedEntity> extractEntities(String text) {
        List<NamedEntity> entities = new ArrayList<>();
        // Tokenize once; tokenizePos gives the character span of each token so
        // finder results (token indices) can be mapped back to text offsets.
        String[] tokens = tokenizer.tokenize(text);
        Span[] tokenSpans = tokenizer.tokenizePos(text);
        entities.addAll(extractEntitiesWithFinder(tokens, tokenSpans, personFinder, EntityType.PERSON));
        entities.addAll(extractEntitiesWithFinder(tokens, tokenSpans, organizationFinder, EntityType.ORGANIZATION));
        entities.addAll(extractEntitiesWithFinder(tokens, tokenSpans, locationFinder, EntityType.LOCATION));
        entities.addAll(extractEntitiesWithFinder(tokens, tokenSpans, dateFinder, EntityType.DATE));
        entities.addAll(extractEntitiesWithFinder(tokens, tokenSpans, timeFinder, EntityType.TIME));
        entities.addAll(extractEntitiesWithFinder(tokens, tokenSpans, moneyFinder, EntityType.MONEY));
        entities.addAll(extractEntitiesWithFinder(tokens, tokenSpans, percentageFinder, EntityType.PERCENT));
        return mergeAndSortEntities(entities);
    }

    /** Runs one finder, converting token spans to character offsets. */
    private List<NamedEntity> extractEntitiesWithFinder(String[] tokens, Span[] tokenSpans,
            NameFinderME finder, EntityType type) {
        List<NamedEntity> entities = new ArrayList<>();
        try {
            Span[] spans = finder.find(tokens);
            // probs() reports per-span probabilities for the last find() call
            double[] probabilities = finder.probs();
            for (int i = 0; i < spans.length; i++) {
                Span span = spans[i];
                double confidence = probabilities[i];
                // Span end is exclusive, hence the -1 for the last token
                int startOffset = tokenSpans[span.getStart()].getStart();
                int endOffset = tokenSpans[span.getEnd() - 1].getEnd();
                String entityText = String.join(" ",
                        Arrays.copyOfRange(tokens, span.getStart(), span.getEnd()));
                entities.add(new NamedEntity(entityText, type, startOffset, endOffset, confidence));
            }
            // Reset per-document adaptive state so documents don't influence each other
            finder.clearAdaptiveData();
        } catch (Exception e) {
            System.err.println("Error extracting " + type + " entities: " + e.getMessage());
        }
        return entities;
    }

    /**
     * De-duplicates and position-sorts entities. The comparator includes the
     * entity type: with the previous offsets-only comparator, two finders
     * reporting the same span with different types silently dropped one of them.
     */
    private List<NamedEntity> mergeAndSortEntities(List<NamedEntity> entities) {
        Set<NamedEntity> uniqueEntities = new TreeSet<>(
                Comparator.comparingInt(NamedEntity::getStartOffset)
                        .thenComparingInt(NamedEntity::getEndOffset)
                        .thenComparing(e -> e.getType().name())
        );
        uniqueEntities.addAll(entities);
        return new ArrayList<>(uniqueEntities);
    }

    /** Extracts entities and groups them by type. */
    public Map<EntityType, List<NamedEntity>> extractEntitiesByType(String text) {
        Map<EntityType, List<NamedEntity>> entitiesByType = new EnumMap<>(EntityType.class);
        List<NamedEntity> allEntities = extractEntities(text);
        for (NamedEntity entity : allEntities) {
            entitiesByType.computeIfAbsent(entity.getType(), k -> new ArrayList<>()).add(entity);
        }
        return entitiesByType;
    }

    /** Clears adaptive data on all finders; models themselves need no disposal. */
    public void close() {
        personFinder.clearAdaptiveData();
        organizationFinder.clearAdaptiveData();
        locationFinder.clearAdaptiveData();
        dateFinder.clearAdaptiveData();
        timeFinder.clearAdaptiveData();
        moneyFinder.clearAdaptiveData();
        percentageFinder.clearAdaptiveData();
    }
}
Stanford CoreNLP NER Implementation
package com.ner.ml;
import com.ner.core.EntityType;
import com.ner.core.NamedEntity;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.*;
/**
 * NER backed by the Stanford CoreNLP pipeline (tokenize/ssplit/pos/lemma/ner
 * plus statistical coreference).
 *
 * NOTE(review): building the pipeline loads large models; construct once and
 * reuse. The coref annotator is configured but its output is currently unused
 * (see resolveCoreferences below).
 */
public class StanfordNER {
// Fully configured CoreNLP pipeline; thread-safety per CoreNLP docs — verify before sharing.
private final StanfordCoreNLP pipeline;
/** Builds a pipeline with the default annotator chain and statistical coref. */
public StanfordNER() {
// Set up pipeline properties
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, coref");
props.setProperty("coref.algorithm", "statistical");
// SUTime normalization of temporal expressions is disabled here
props.setProperty("ner.useSUTime", "false");
this.pipeline = new StanfordCoreNLP(props);
}
/** Builds a pipeline from caller-supplied properties (annotators, models, etc.). */
public StanfordNER(Properties customProperties) {
this.pipeline = new StanfordCoreNLP(customProperties);
}
/**
 * Annotates the text and collects entities sentence by sentence.
 * Offsets are character positions into the original document text.
 */
public List<NamedEntity> extractEntities(String text) {
List<NamedEntity> entities = new ArrayList<>();
// Create annotation object
Annotation document = new Annotation(text);
// Run all Annotators on this text
pipeline.annotate(document);
// Process sentences
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
entities.addAll(extractEntitiesFromSentence(sentence));
}
// Resolve coreferences
resolveCoreferences(entities, document);
return entities;
}
/**
 * Collapses consecutive tokens with the same (non-"O") NER tag into a single
 * entity. The confidence is hard-coded at 0.9 because CoreNLP does not
 * expose a per-entity probability through this annotation.
 *
 * NOTE(review): two distinct adjacent entities of the same type (e.g.
 * "France Germany") are merged into one mention by this scheme — confirm
 * that is acceptable for the use case.
 */
private List<NamedEntity> extractEntitiesFromSentence(CoreMap sentence) {
List<NamedEntity> entities = new ArrayList<>();
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
NamedEntity currentEntity = null;
for (int i = 0; i < tokens.size(); i++) {
CoreLabel token = tokens.get(i);
String nerTag = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (!"O".equals(nerTag)) {
// Start of a new entity or continuation of current entity
EntityType entityType = mapStanfordNERtoEntityType(nerTag);
String tokenText = token.get(CoreAnnotations.TextAnnotation.class);
int startOffset = token.beginPosition();
int endOffset = token.endPosition();
if (currentEntity != null && currentEntity.getType() == entityType) {
// Continue current entity: extend the span and join the text with a
// single space (original inter-token whitespace is not preserved)
String mergedText = currentEntity.getText() + " " + tokenText;
currentEntity = new NamedEntity(mergedText, entityType,
currentEntity.getStartOffset(), endOffset, 0.9);
} else {
// Start new entity
if (currentEntity != null) {
entities.add(currentEntity);
}
currentEntity = new NamedEntity(tokenText, entityType, startOffset, endOffset, 0.9);
}
} else {
// End of current entity
if (currentEntity != null) {
entities.add(currentEntity);
currentEntity = null;
}
}
}
// Add the last entity if exists
if (currentEntity != null) {
entities.add(currentEntity);
}
return entities;
}
/**
 * Maps CoreNLP tag strings onto this project's EntityType enum.
 * CITY and STATE_OR_PROVINCE deliberately map to LOCATION while COUNTRY
 * maps to GPE; unmapped tags fall through to UNKNOWN.
 */
private EntityType mapStanfordNERtoEntityType(String stanfordTag) {
switch (stanfordTag) {
case "PERSON": return EntityType.PERSON;
case "ORGANIZATION": return EntityType.ORGANIZATION;
case "LOCATION": return EntityType.LOCATION;
case "CITY": return EntityType.LOCATION;
case "STATE_OR_PROVINCE": return EntityType.LOCATION;
case "COUNTRY": return EntityType.GPE;
case "NATIONALITY": return EntityType.NATIONALITY;
case "DATE": return EntityType.DATE;
case "TIME": return EntityType.TIME;
case "MONEY": return EntityType.MONEY;
case "PERCENT": return EntityType.PERCENT;
case "CAUSE_OF_DEATH": return EntityType.CAUSE_OF_DEATH;
case "CRIMINAL_CHARGE": return EntityType.CRIMINAL_CHARGE;
case "TITLE": return EntityType.TITLE;
case "IDEOLOGY": return EntityType.IDEOLOGY;
case "RELIGION": return EntityType.RELIGION;
case "NUMBER": return EntityType.NUMBER;
case "ORDINAL": return EntityType.ORDINAL;
case "DURATION": return EntityType.DURATION;
case "SET": return EntityType.SET;
default: return EntityType.UNKNOWN;
}
}
/**
 * Associates entities with coreference chains by matching each chain's
 * representative mention text against entity texts.
 *
 * NOTE(review): the chain-to-entities grouping computed here is discarded —
 * this method currently has no observable effect. Kept as a hook for future
 * entity linking/consolidation; wire its output through before relying on it.
 */
private void resolveCoreferences(List<NamedEntity> entities, Annotation document) {
Map<Integer, CorefChain> corefChains = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
if (corefChains == null) return;
// Create mapping from entity mentions to coreference chains
Map<NamedEntity, Integer> entityToChain = new HashMap<>();
for (CorefChain chain : corefChains.values()) {
CorefChain.CorefMention representative = chain.getRepresentativeMention();
if (representative != null) {
String representativeText = representative.mentionSpan;
// Find entities that match this representative
for (NamedEntity entity : entities) {
if (entity.getText().equalsIgnoreCase(representativeText)) {
entityToChain.put(entity, chain.getChainID());
}
}
}
}
// Group entities by coreference chain
Map<Integer, List<NamedEntity>> chainToEntities = new HashMap<>();
for (Map.Entry<NamedEntity, Integer> entry : entityToChain.entrySet()) {
chainToEntities.computeIfAbsent(entry.getValue(), k -> new ArrayList<>())
.add(entry.getKey());
}
// You can use this coreference information for entity linking or consolidation
// For now, we just store the information in the entities
}
/** Extracts entities and groups them by their surface text. */
public Map<String, List<NamedEntity>> extractEntitiesGroupedByText(String text) {
List<NamedEntity> entities = extractEntities(text);
Map<String, List<NamedEntity>> grouped = new HashMap<>();
for (NamedEntity entity : entities) {
grouped.computeIfAbsent(entity.getText(), k -> new ArrayList<>()).add(entity);
}
return grouped;
}
/** Debug helper: prints word/POS/NER for every tagged token to stdout. */
public void analyzeText(String text) {
Annotation document = new Annotation(text);
pipeline.annotate(document);
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
System.out.println("Sentence: " + sentence.get(CoreAnnotations.TextAnnotation.class));
for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
String word = token.get(CoreAnnotations.TextAnnotation.class);
String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
String ner = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (!"O".equals(ner)) {
System.out.printf(" %s/%s/%s%n", word, pos, ner);
}
}
}
}
}
Hybrid NER System
package com.ner.hybrid;
import com.ner.core.EntityType;
import com.ner.core.NamedEntity;
import com.ner.ml.OpenNLPNER;
import com.ner.ml.StanfordNER;
import com.ner.rulebased.DictionaryNER;
import java.util.*;
/**
 * Ensemble NER that unions the results of the dictionary, OpenNLP, and
 * Stanford engines (each individually toggleable), then resolves overlapping
 * spans by length, then confidence, then type specificity.
 */
public class HybridNER {

    /**
     * Type specificity used as the final tie-breaker in conflict resolution;
     * higher wins. Hoisted to a constant so it is not rebuilt on every
     * comparison (the original allocated a Map.of per call).
     */
    private static final Map<EntityType, Integer> SPECIFICITY = Map.of(
            EntityType.PERSON, 10,
            EntityType.ORGANIZATION, 9,
            EntityType.LOCATION, 8,
            EntityType.GPE, 7,
            EntityType.DATE, 6,
            EntityType.MONEY, 5,
            EntityType.UNKNOWN, 0
    );

    // Engines are null when disabled; the boolean flags guard every access.
    private final DictionaryNER dictionaryNER;
    private final OpenNLPNER openNlpNER;
    private final StanfordNER stanfordNER;
    private final boolean useDictionary;
    private final boolean useOpenNLP;
    private final boolean useStanford;

    /** Enables all three engines. */
    public HybridNER() {
        this(true, true, true);
    }

    /** Enables only the selected engines; disabled ones are never constructed. */
    public HybridNER(boolean useDictionary, boolean useOpenNLP, boolean useStanford) {
        this.useDictionary = useDictionary;
        this.useOpenNLP = useOpenNLP;
        this.useStanford = useStanford;
        this.dictionaryNER = useDictionary ? new DictionaryNER() : null;
        this.openNlpNER = useOpenNLP ? new OpenNLPNER() : null;
        this.stanfordNER = useStanford ? new StanfordNER() : null;
    }

    /**
     * Runs every enabled engine, de-duplicates identical mentions (NamedEntity
     * equality ignores confidence), and resolves overlapping spans.
     */
    public List<NamedEntity> extractEntities(String text) {
        Set<NamedEntity> allEntities = new HashSet<>();
        if (useDictionary) {
            allEntities.addAll(dictionaryNER.extractEntities(text));
        }
        if (useOpenNLP) {
            allEntities.addAll(openNlpNER.extractEntities(text));
        }
        if (useStanford) {
            allEntities.addAll(stanfordNER.extractEntities(text));
        }
        return resolveConflicts(new ArrayList<>(allEntities));
    }

    /** Left-to-right sweep keeping one winner among each run of overlapping entities. */
    private List<NamedEntity> resolveConflicts(List<NamedEntity> entities) {
        entities.sort(Comparator.comparingInt(NamedEntity::getStartOffset));
        List<NamedEntity> resolved = new ArrayList<>();
        NamedEntity current = null;
        for (NamedEntity entity : entities) {
            if (current == null) {
                current = entity;
            } else if (isOverlapping(current, entity)) {
                current = resolveEntityConflict(current, entity);
            } else {
                resolved.add(current);
                current = entity;
            }
        }
        if (current != null) {
            resolved.add(current);
        }
        return resolved;
    }

    /** True when the two half-open spans [start, end) intersect. */
    private boolean isOverlapping(NamedEntity e1, NamedEntity e2) {
        return !(e1.getEndOffset() <= e2.getStartOffset() ||
                e2.getEndOffset() <= e1.getStartOffset());
    }

    /**
     * Picks a winner between two overlapping entities: longer span first,
     * then higher confidence, then more specific type, then the first one.
     */
    private NamedEntity resolveEntityConflict(NamedEntity e1, NamedEntity e2) {
        int len1 = e1.getEndOffset() - e1.getStartOffset();
        int len2 = e2.getEndOffset() - e2.getStartOffset();
        if (len1 > len2) {
            return e1;
        } else if (len2 > len1) {
            return e2;
        }
        if (e1.getConfidence() > e2.getConfidence()) {
            return e1;
        } else if (e2.getConfidence() > e1.getConfidence()) {
            return e2;
        }
        if (isMoreSpecific(e1.getType(), e2.getType())) {
            return e1;
        } else if (isMoreSpecific(e2.getType(), e1.getType())) {
            return e2;
        }
        return e1;
    }

    /** True when type1 outranks type2 in the specificity table (unlisted types rank 0). */
    private boolean isMoreSpecific(EntityType type1, EntityType type2) {
        return SPECIFICITY.getOrDefault(type1, 0) > SPECIFICITY.getOrDefault(type2, 0);
    }

    /** Extracts entities and groups them by type. */
    public Map<EntityType, List<NamedEntity>> extractEntitiesByType(String text) {
        List<NamedEntity> entities = extractEntities(text);
        Map<EntityType, List<NamedEntity>> entitiesByType = new EnumMap<>(EntityType.class);
        for (NamedEntity entity : entities) {
            entitiesByType.computeIfAbsent(entity.getType(), k -> new ArrayList<>()).add(entity);
        }
        return entitiesByType;
    }

    /** Extracts entities and wraps them with an aggregate confidence score. */
    public NERResult analyzeTextWithConfidence(String text) {
        List<NamedEntity> entities = extractEntities(text);
        double overallConfidence = calculateOverallConfidence(entities);
        return new NERResult(text, entities, overallConfidence);
    }

    /** Mean entity confidence; 0.0 when nothing was found. */
    private double calculateOverallConfidence(List<NamedEntity> entities) {
        if (entities.isEmpty()) return 0.0;
        double sum = 0.0;
        for (NamedEntity entity : entities) {
            sum += entity.getConfidence();
        }
        return sum / entities.size();
    }

    /** Immutable snapshot of one analysis run: input text, entities, aggregate confidence. */
    public static class NERResult {
        private final String text;
        private final List<NamedEntity> entities;
        private final double overallConfidence;
        private final Date timestamp; // creation time of this result

        public NERResult(String text, List<NamedEntity> entities, double overallConfidence) {
            this.text = text;
            this.entities = entities;
            this.overallConfidence = overallConfidence;
            this.timestamp = new Date();
        }

        // Getters
        public String getText() { return text; }
        public List<NamedEntity> getEntities() { return entities; }
        public double getOverallConfidence() { return overallConfidence; }
        public Date getTimestamp() { return timestamp; }

        /** @return number of entities found per type */
        public Map<EntityType, Integer> getEntityCounts() {
            Map<EntityType, Integer> counts = new EnumMap<>(EntityType.class);
            for (NamedEntity entity : entities) {
                counts.merge(entity.getType(), 1, Integer::sum);
            }
            return counts;
        }
    }

    /** Releases engine resources; only the OpenNLP engine needs explicit cleanup. */
    public void close() {
        if (openNlpNER != null) {
            openNlpNER.close();
        }
    }
}
Web Application for NER
Spring Boot REST API
package com.ner.api;
import com.ner.core.EntityType;
import com.ner.core.NamedEntity;
import com.ner.hybrid.HybridNER;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jsoup.Jsoup;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Spring Boot REST API exposing the hybrid NER pipeline.
 *
 * NOTE(review): the HybridNER ensemble (all three engines) is built in the
 * controller constructor, so application startup pays the full model-loading
 * cost — confirm that is acceptable, or lazy-load behind a bean.
 */
@SpringBootApplication
@RestController
@RequestMapping("/api/ner")
public class NERApplication {

    private final HybridNER ner;
    private final ObjectMapper objectMapper;

    public NERApplication() {
        this.ner = new HybridNER();
        this.objectMapper = new ObjectMapper();
    }

    public static void main(String[] args) {
        SpringApplication.run(NERApplication.class, args);
    }

    /**
     * Extracts entities from raw text posted as JSON. When {@code cleanHtml}
     * is set (the default) the text is first stripped of HTML markup.
     */
    @PostMapping("/extract")
    public NERResponse extractEntities(@RequestBody TextRequest request) {
        try {
            String text = request.getText();
            boolean cleanHtml = request.isCleanHtml();
            if (cleanHtml) {
                // Jsoup.parse(..).text() strips tags and normalizes whitespace
                text = Jsoup.parse(text).text();
            }
            List<NamedEntity> entities = ner.extractEntities(text);
            Map<EntityType, List<NamedEntity>> entitiesByType = entities.stream()
                    .collect(Collectors.groupingBy(NamedEntity::getType));
            return new NERResponse(true, "Entities extracted successfully",
                    entities, entitiesByType, text);
        } catch (Exception e) {
            return new NERResponse(false, "Error extracting entities: " + e.getMessage(),
                    null, null, null);
        }
    }

    /** Extracts entities from an uploaded file; the file is always HTML-stripped. */
    @PostMapping("/extract-file")
    public NERResponse extractFromFile(@RequestParam("file") MultipartFile file) {
        try {
            // Decode explicitly as UTF-8: the no-charset String(byte[]) ctor
            // uses the platform default, which corrupts non-ASCII uploads on
            // hosts with a different default encoding.
            String content = new String(file.getBytes(), java.nio.charset.StandardCharsets.UTF_8);
            String cleanedContent = Jsoup.parse(content).text();
            List<NamedEntity> entities = ner.extractEntities(cleanedContent);
            Map<EntityType, List<NamedEntity>> entitiesByType = entities.stream()
                    .collect(Collectors.groupingBy(NamedEntity::getType));
            return new NERResponse(true, "Entities extracted from file successfully",
                    entities, entitiesByType, cleanedContent);
        } catch (Exception e) {
            return new NERResponse(false, "Error processing file: " + e.getMessage(),
                    null, null, null);
        }
    }

    /** Lists the entity categories this API reports, with short descriptions. */
    @GetMapping("/types")
    public Map<String, String> getEntityTypes() {
        return Map.of(
                "PERSON", "People, including fictional",
                "ORGANIZATION", "Companies, agencies, institutions",
                "LOCATION", "Physical locations",
                "DATE", "Absolute or relative dates or periods",
                "MONEY", "Monetary values",
                "PERCENT", "Percentage values",
                "TIME", "Times smaller than a day",
                "EMAIL", "Email addresses",
                "URL", "Web addresses",
                "PHONE", "Phone numbers"
        );
    }

    /** Request body for /extract. */
    public static class TextRequest {
        private String text;
        private boolean cleanHtml = true; // strip HTML by default

        // Getters and setters (required by Jackson data binding)
        public String getText() { return text; }
        public void setText(String text) { this.text = text; }
        public boolean isCleanHtml() { return cleanHtml; }
        public void setCleanHtml(boolean cleanHtml) { this.cleanHtml = cleanHtml; }
    }

    /** Response envelope shared by all extraction endpoints. */
    public static class NERResponse {
        private boolean success;
        private String message;
        private List<NamedEntity> entities;
        private Map<EntityType, List<NamedEntity>> entitiesByType;
        private String processedText; // text actually analyzed (post HTML-stripping)
        private long timestamp;       // epoch millis at response creation

        public NERResponse(boolean success, String message, List<NamedEntity> entities,
                Map<EntityType, List<NamedEntity>> entitiesByType, String processedText) {
            this.success = success;
            this.message = message;
            this.entities = entities;
            this.entitiesByType = entitiesByType;
            this.processedText = processedText;
            this.timestamp = System.currentTimeMillis();
        }

        // Getters (serialized by Jackson)
        public boolean isSuccess() { return success; }
        public String getMessage() { return message; }
        public List<NamedEntity> getEntities() { return entities; }
        public Map<EntityType, List<NamedEntity>> getEntitiesByType() { return entitiesByType; }
        public String getProcessedText() { return processedText; }
        public long getTimestamp() { return timestamp; }
    }
}
Performance Evaluation
package com.ner.evaluation;
import com.ner.core.EntityType;
import com.ner.core.NamedEntity;
import com.ner.hybrid.HybridNER;
import java.util.List;
import java.util.Map;
public class NEREvaluator {
/**
 * Immutable precision/recall/F1 summary derived from raw TP/FP/FN counts.
 * All derived metrics fall back to 0.0 when their denominator is zero.
 */
public static class EvaluationResult {
    private final int truePositives;
    private final int falsePositives;
    private final int falseNegatives;
    private final double precision;
    private final double recall;
    private final double f1Score;

    public EvaluationResult(int tp, int fp, int fn) {
        this.truePositives = tp;
        this.falsePositives = fp;
        this.falseNegatives = fn;
        // precision = TP / predicted positives; recall = TP / actual positives
        int predicted = tp + fp;
        int actual = tp + fn;
        this.precision = predicted > 0 ? (double) tp / predicted : 0.0;
        this.recall = actual > 0 ? (double) tp / actual : 0.0;
        double prSum = precision + recall;
        this.f1Score = prSum > 0 ? 2 * precision * recall / prSum : 0.0;
    }

    // Getters
    public int getTruePositives() { return truePositives; }
    public int getFalsePositives() { return falsePositives; }
    public int getFalseNegatives() { return falseNegatives; }
    public double getPrecision() { return precision; }
    public double getRecall() { return recall; }
    public double getF1Score() { return f1Score; }
}
public static EvaluationResult evaluate(HybridNER ner, String text,
List<NamedEntity> groundTruth) {
List<NamedEntity> predictions = ner.extractEntities(text);
int truePositives = 0;
int falsePositives = 0;
int falseNegatives = 0;
// Count true positives
for (NamedEntity predicted : predictions) {
boolean found = false;
for (NamedEntity truth : groundTruth) {
if (isMatch(predicted, truth)) {
truePositives++;
found = true;
break;
}
}
if (!found) {
falsePositives++;
}
}
// Count false negatives
for (NamedEntity truth : groundTruth) {
boolean found = false;
for (NamedEntity predicted : predictions) {
if (isMatch(predicted, truth)) {
found = true;
break;
}
}
if (!found) {
falseNegatives++;
}
}
return new EvaluationResult(truePositives, falsePositives, falseNegatives);
}
private static boolean isMatch(NamedEntity e1, NamedEntity e2) {
// Consider entities matching if they overlap and have same type
return e1.getType() == e2.getType() &&
isOverlapping(e1, e2) &&
textSimilarity(e1.getText(), e2.getText()) > 0.7;
}
private static boolean isOverlapping(NamedEntity e1, NamedEntity e2) {
return !(e1.getEndOffset() <= e2.getStartOffset() ||
e2.getEndOffset() <= e1.getStartOffset());
}
private static double textSimilarity(String s1, String s2) {
String normalized1 = s1.toLowerCase().trim();
String normalized2 = s2.toLowerCase().trim();
if (normalized1.equals(normalized2)) return 1.0;
// Simple Jaccard similarity
Set<Character> set1 = new HashSet<>();
Set<Character> set2 = new HashSet<>();
for (char c : normalized1.toCharArray()) set1.add(c);
for (char c : normalized2.toCharArray()) set2.add(c);
Set<Character> intersection = new HashSet<>(set1);
intersection.retainAll(set2);
Set<Character> union = new HashSet<>(set1);
union.addAll(set2);
return union.isEmpty() ? 0.0 : (double) intersection.size() / union.size();
}
}
Usage Example
package com.ner.demo;
import com.ner.core.EntityType;
import com.ner.core.NamedEntity;
import com.ner.hybrid.HybridNER;
import java.util.List;
import java.util.Map;
/**
 * Command-line demonstration of the hybrid NER pipeline: extracts entities
 * from a fixed sample sentence and prints them both as a flat list (with
 * confidences) and grouped by entity type.
 */
public class NERDemo {
public static void main(String[] args) {
HybridNER recognizer = new HybridNER();
String sample = "Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne " +
"on April 1, 1976 in Cupertino, California. The company's revenue reached " +
"$365.8 billion in 2021. You can contact them at [email protected] or " +
"visit their website at https://www.apple.com.";
System.out.println("Text: " + sample);
System.out.println("\nExtracted Entities:");
System.out.println("===================");
// Flat view: one line per entity with its type label and confidence score.
for (NamedEntity found : recognizer.extractEntities(sample)) {
System.out.printf("%-15s: %s (confidence: %.2f)%n",
found.getType().getDisplayName(),
found.getText(),
found.getConfidence());
}
System.out.println("\nEntities by Type:");
System.out.println("=================");
// Grouped view: type heading with count, then the member entities indented.
Map<EntityType, List<NamedEntity>> grouped = recognizer.extractEntitiesByType(sample);
for (Map.Entry<EntityType, List<NamedEntity>> group : grouped.entrySet()) {
List<NamedEntity> members = group.getValue();
System.out.printf("%s (%d):%n", group.getKey().getDisplayName(), members.size());
members.forEach(member -> System.out.printf("  - %s%n", member.getText()));
}
// Release any model/pipeline resources held by the recognizer.
recognizer.close();
}
}
Summary
This comprehensive Named Entity Recognition system provides:
- Multiple Approaches: Rule-based, machine learning, and hybrid systems
- Entity Types: Support for 20+ entity types including custom types
- Integration: Apache OpenNLP, Stanford CoreNLP, and custom rule-based systems
- REST API: Spring Boot-based web service for easy integration
- Evaluation: Performance metrics and testing framework
- Production Ready: Error handling, logging, and resource management
The system can be easily extended with custom entity types, domain-specific dictionaries, and additional machine learning models for specialized use cases.