Markdown to HTML Converter in Java

An extensible Markdown to HTML converter implemented in Java. It supports a commonly used subset of the CommonMark specification plus selected GitHub Flavored Markdown (GFM) features such as tables and strikethrough.

Complete Implementation

1. Main Converter Class

package com.markdownconverter;
import java.util.*;
import java.util.regex.*;
import java.io.*;
import java.nio.file.*;
/**
 * Advanced Markdown to HTML Converter.
 * Supports a subset of the CommonMark specification with optional GFM extensions.
 *
 * <p>Conversion is line based: the input is split into lines, each run of lines is
 * dispatched to a block handler, and the resulting HTML fragments are joined with
 * newlines.
 */
public class MarkdownToHtmlConverter {

    /**
     * Characters that must be escaped in HTML output, mapped to their entities.
     * The map iterates in insertion order with "&" first, so code that applies the
     * replacements one after another never re-escapes the "&" of an entity it has
     * just produced.
     */
    private static final Map<String, String> HTML_ENTITIES = createHtmlEntities();

    /** Enables GitHub Flavored Markdown extensions (tables, strikethrough). */
    private final boolean supportGFM;
    /** When true, HTML special characters in the input are escaped. */
    private final boolean sanitizeHTML;
    /** Stored for callers; not read by any code inside this class body. */
    private final boolean allowRawHTML;

    /** Creates a converter with GFM enabled, sanitization enabled and raw HTML disallowed. */
    public MarkdownToHtmlConverter() {
        this(true, true, false);
    }

    /**
     * Creates a converter with explicit settings.
     *
     * @param supportGFM   enable GFM extensions
     * @param sanitizeHTML escape HTML special characters found in the input
     * @param allowRawHTML whether raw HTML is allowed
     */
    public MarkdownToHtmlConverter(boolean supportGFM, boolean sanitizeHTML, boolean allowRawHTML) {
        this.supportGFM = supportGFM;
        this.sanitizeHTML = sanitizeHTML;
        this.allowRawHTML = allowRawHTML;
    }

    /**
     * Converts Markdown text to HTML.
     *
     * <p>Inline markup is rendered per block while the blocks are built. Running the
     * inline pass over the joined output instead would escape the tags the block
     * handlers have just generated.
     *
     * @param markdown the Markdown source; may be null
     * @return the generated HTML, or an empty string for null or blank input
     */
    public String convert(String markdown) {
        if (markdown == null || markdown.trim().isEmpty()) {
            return "";
        }
        String[] lines = markdown.split("\r?\n");
        return String.join("\n", processBlocks(lines));
    }

    /**
     * Converts a Markdown file to an HTML file.
     *
     * @param inputFile  path of the Markdown file to read
     * @param outputFile path of the HTML file to write
     * @throws IOException if reading or writing fails
     */
    public void convertFile(String inputFile, String outputFile) throws IOException {
        String markdown = Files.readString(Paths.get(inputFile));
        Files.writeString(Paths.get(outputFile), convert(markdown));
    }

    /**
     * Process block-level elements.
     *
     * <p>Each iteration dispatches the current line to one handler, which returns the
     * index of the next unprocessed line. Heading and paragraph handlers return their
     * text un-rendered, so inline markup is applied to their output here; the other
     * handlers (block quote, code, list, table) produce finished HTML themselves.
     *
     * @param lines the input split into lines
     * @return one HTML fragment per block, in document order
     */
    private List<String> processBlocks(String[] lines) {
        List<String> result = new ArrayList<>();
        int i = 0;
        while (i < lines.length) {
            String line = lines[i];
            int firstNew = result.size();
            int next;
            if (isHorizontalRule(line)) {
                result.add("<hr>");
                next = i + 1;
            } else if (isBlockquote(line)) {
                next = processBlockquote(lines, i, result);
            } else if (isCodeBlock(line)) {
                next = processCodeBlock(lines, i, result);
            } else if (isList(line)) {
                next = processList(lines, i, result);
            } else if (isHeading(line) || isSetextHeading(lines, i)) {
                // The heading test runs after the unambiguous markers above so that a
                // heading test accepting ordinary text cannot shadow them.
                next = processHeading(lines, i, result);
                renderInlineInNewBlocks(result, firstNew);
            } else if (supportGFM && isTable(line)) {
                next = processTable(lines, i, result);
            } else {
                next = processParagraph(lines, i, result);
                renderInlineInNewBlocks(result, firstNew);
            }
            if (next <= i) {
                // A handler that consumed nothing would make this loop spin forever.
                // Emit the line as a plain paragraph and move on.
                if (!line.trim().isEmpty()) {
                    result.add("<p>" + processInlineElements(line.trim()) + "</p>");
                }
                next = i + 1;
            }
            i = next;
        }
        return result;
    }

    /**
     * Tells whether the line at {@code index} is a non-blank text line underlined by a
     * run of '=' or '-' on the following line (Setext heading).
     */
    private boolean isSetextHeading(String[] lines, int index) {
        if (index + 1 >= lines.length || lines[index].trim().isEmpty()) {
            return false;
        }
        String underline = lines[index + 1];
        return underline.matches("^=+$") || underline.matches("^-+$");
    }

    /**
     * Renders inline markup inside the fragments added to {@code result} from position
     * {@code from} onwards. Each fragment is expected to be a single element wrapping
     * raw text, such as {@code <p>text</p>}; only the text between the end of the
     * opening tag and the start of the closing tag is rendered.
     */
    private void renderInlineInNewBlocks(List<String> result, int from) {
        for (int k = from; k < result.size(); k++) {
            String block = result.get(k);
            int openEnd = block.indexOf('>');
            int closeStart = block.lastIndexOf("</");
            if (block.startsWith("<") && openEnd >= 0 && closeStart > openEnd) {
                String inner = processInlineElements(block.substring(openEnd + 1, closeStart));
                result.set(k, block.substring(0, openEnd + 1) + inner + block.substring(closeStart));
            }
        }
    }

    /** Builds the entity table in a fixed order, "&" first. */
    private static Map<String, String> createHtmlEntities() {
        Map<String, String> entities = new LinkedHashMap<>();
        entities.put("&", "&amp;");
        entities.put("<", "&lt;");
        entities.put(">", "&gt;");
        entities.put("\"", "&quot;");
        entities.put("'", "&#39;");
        return Collections.unmodifiableMap(entities);
    }
}

2. Block Element Processing

    /** ATX heading: 1-6 '#', whitespace, the text, and an optional closing run of '#'. */
    private static final Pattern ATX_HEADING_PATTERN =
            Pattern.compile("^(#{1,6})\\s+(.*?)(?:\\s+#+)?\\s*$");

    /**
     * Process headings (#, ##, ###, etc.) and underlined (Setext) headings.
     *
     * <p>The heading text is added as-is; this method performs no inline rendering.
     *
     * @param lines  all input lines
     * @param index  index of the candidate heading line
     * @param result list the generated HTML fragment is appended to
     * @return index of the next unprocessed line; always greater than {@code index}
     */
    private int processHeading(String[] lines, int index, List<String> result) {
        String line = lines[index];
        Matcher matcher = ATX_HEADING_PATTERN.matcher(line);
        if (matcher.matches()) {
            result.add(headingHtml(matcher.group(1).length(), matcher.group(2).trim()));
            return index + 1;
        }
        // Alternative syntax (underlined)
        if (index + 1 < lines.length && !line.trim().isEmpty()) {
            String underline = lines[index + 1];
            if (underline.matches("^=+$")) {
                result.add(headingHtml(1, line.trim()));
                return index + 2;
            }
            if (underline.matches("^-+$")) {
                result.add(headingHtml(2, line.trim()));
                return index + 2;
            }
        }
        int next = processParagraph(lines, index, result);
        if (next > index) {
            return next;
        }
        // The paragraph scanner consumed nothing. Emit the line on its own so the
        // caller always makes progress.
        if (!line.trim().isEmpty()) {
            result.add("<p>" + line.trim() + "</p>");
        }
        return index + 1;
    }

    /** Builds an {@code <hN>} element whose id is derived from the heading text. */
    private String headingHtml(int level, String text) {
        return String.format("<h%d id=\"%s\">%s</h%d>", level, generateHeadingId(text), text, level);
    }
/**
 * Process paragraphs.
 *
 * <p>Leading blank lines are skipped. The paragraph then extends until the next blank
 * line or the next line that starts another block element, so two runs of text
 * separated by a blank line become two paragraphs. The text is added un-rendered.
 *
 * @param lines  all input lines
 * @param index  index of the first line to look at
 * @param result list the generated {@code <p>} fragment is appended to
 * @return index of the first line that was not consumed
 */
private int processParagraph(String[] lines, int index, List<String> result) {
    int i = index;
    while (i < lines.length && lines[i].trim().isEmpty()) {
        i++;
    }
    StringBuilder paragraph = new StringBuilder();
    while (i < lines.length && !lines[i].trim().isEmpty() && !isBlockElement(lines[i])) {
        if (paragraph.length() > 0) {
            paragraph.append(' ');
        }
        paragraph.append(lines[i].trim());
        i++;
    }
    if (paragraph.length() > 0) {
        result.add("<p>" + paragraph + "</p>");
    }
    return i;
}
/**
 * Process code blocks (```).
 *
 * <p>Lines up to the closing fence, or to the end of input when the fence is never
 * closed, are passed through {@code escapeHtml} and joined with newlines. Leading
 * indentation of the code is preserved; only trailing whitespace is removed.
 *
 * @param lines  all input lines
 * @param index  index of the opening fence line
 * @param result list the generated {@code <pre><code>} fragment is appended to
 * @return index of the line after the closing fence
 */
private int processCodeBlock(String[] lines, int index, List<String> result) {
    String fence = lines[index].trim();
    if (!fence.startsWith("```")) {
        return processParagraph(lines, index, result);
    }
    // Only the first word of the info string names the language. It is escaped
    // because it ends up inside an attribute value.
    String info = fence.substring(3).trim();
    String language = info.isEmpty() ? "" : info.split("\\s+")[0];
    List<String> codeLines = new ArrayList<>();
    int i = index + 1;
    // Read until closing ```
    while (i < lines.length && !lines[i].trim().equals("```")) {
        codeLines.add(escapeHtml(lines[i]));
        i++;
    }
    if (i < lines.length) {
        i++; // Skip the closing ```
    }
    String code = String.join("\n", codeLines).replaceFirst("\\s+$", "");
    String codeClass = language.isEmpty() ? "" : " class=\"language-" + escapeHtml(language) + "\"";
    result.add("<pre><code" + codeClass + ">" + code + "</code></pre>");
    return i;
}
/**
 * Process blockquotes (>).
 *
 * <p>Consecutive quoted lines, and blank lines between them, are collected. The
 * leading marker is removed from each quoted line and the remaining text is converted
 * recursively as Markdown.
 *
 * @param lines  all input lines
 * @param index  index of the first quoted line
 * @param result list the generated {@code <blockquote>} fragment is appended to
 * @return index of the first line after the quote
 */
private int processBlockquote(String[] lines, int index, List<String> result) {
    StringBuilder quoted = new StringBuilder();
    int i = index;
    while (i < lines.length && (isBlockquote(lines[i]) || lines[i].trim().isEmpty())) {
        String line = lines[i];
        if (isBlockquote(line)) {
            // isBlockquote ignores leading whitespace, so the marker may be indented.
            // Stripping only an unindented '>' would leave the marker in place and
            // produce a spurious nested quote.
            quoted.append(line.replaceFirst("^\\s*>\\s*", "").trim()).append("\n");
        }
        i++;
    }
    // Recursively process the blockquote content
    String innerHtml = convert(quoted.toString().trim());
    // Unwrap only when the content is exactly one paragraph. Removing the first <p>
    // and the last </p> from multi-block content would leave unbalanced tags.
    boolean singleParagraph = innerHtml.startsWith("<p>")
            && innerHtml.endsWith("</p>")
            && innerHtml.indexOf("<p>", 3) < 0;
    if (singleParagraph) {
        innerHtml = innerHtml.substring(3, innerHtml.length() - 4);
    }
    result.add("<blockquote>" + innerHtml + "</blockquote>");
    return i;
}

3. List Processing

    /**
     * Process ordered and unordered lists.
     *
     * <p>The list kind is taken from the first line. Every item is rendered as inline
     * Markdown inside an {@code <li>} element.
     *
     * @param lines  all input lines
     * @param index  index of the first list line
     * @param result list the generated {@code <ol>}/{@code <ul>} fragment is appended to
     * @return index of the first line after the list, counting continuation lines
     */
    private int processList(String[] lines, int index, List<String> result) {
        String tag = isOrderedList(lines[index]) ? "ol" : "ul";
        List<ListItem> items = parseListItems(lines, index);
        StringBuilder listHtml = new StringBuilder();
        listHtml.append("<").append(tag).append(">\n");
        int consumed = 0;
        for (ListItem item : items) {
            // Continuation lines are joined with spaces by parseListItems, so the
            // content is always a single line of inline Markdown.
            listHtml.append("  <li>")
                    .append(processInlineElements(item.content.trim()))
                    .append("</li>\n");
            consumed += item.lineCount;
        }
        listHtml.append("</").append(tag).append(">");
        result.add(listHtml.toString());
        // An item may span several source lines, so advance by lines, not by items.
        return index + Math.max(consumed, 1);
    }

    /**
     * Collects consecutive list items starting at {@code startIndex}. Non-blank lines
     * that follow an item and are not list lines themselves are appended to that item
     * as continuation text.
     */
    private List<ListItem> parseListItems(String[] lines, int startIndex) {
        List<ListItem> items = new ArrayList<>();
        int i = startIndex;
        while (i < lines.length && isList(lines[i])) {
            String line = lines[i];
            int firstLine = i;
            int indent = getIndentationLevel(line);
            StringBuilder itemContent =
                    new StringBuilder(line.replaceFirst("^\\s*([*+-]|\\d+\\.)\\s+", ""));
            i++;
            while (i < lines.length && !isList(lines[i]) && !lines[i].trim().isEmpty()) {
                // Continuation line
                itemContent.append(" ").append(lines[i].trim());
                i++;
            }
            items.add(new ListItem(indent, itemContent.toString(), i - firstLine));
        }
        return items;
    }

    /** One parsed list item. */
    private static class ListItem {
        /** Indentation width of the item's marker line. */
        int indent;
        /** Item text with the marker removed and continuation lines joined. */
        String content;
        /** Number of source lines the item occupies, including continuation lines. */
        int lineCount;

        ListItem(int indent, String content) {
            this(indent, content, 1);
        }

        ListItem(int indent, String content, int lineCount) {
            this.indent = indent;
            this.content = content;
            this.lineCount = lineCount;
        }
    }

4. Table Processing (GFM)

    /**
     * Process GitHub Flavored Markdown tables.
     *
     * <p>A table is a header row at {@code index}, followed by a separator row with
     * the same number of cells, followed by zero or more body rows. When that shape is
     * not present, the line at {@code index} is emitted as a paragraph instead.
     *
     * @param lines  all input lines
     * @param index  index of the candidate header row
     * @param result list the generated fragment is appended to
     * @return index of the next unprocessed line; always greater than {@code index}
     */
    private int processTable(String[] lines, int index, List<String> result) {
        String headerLine = lines[index];
        String[] headers = parseTableRow(headerLine);
        boolean hasSeparator = headers.length > 0
                && index + 1 < lines.length
                && isTableSeparator(lines[index + 1])
                && parseTableRow(lines[index + 1]).length == headers.length;
        if (!hasSeparator) {
            // Not a table. Consume exactly this line so the caller keeps advancing.
            if (!headerLine.trim().isEmpty()) {
                result.add("<p>" + processInlineElements(headerLine.trim()) + "</p>");
            }
            return index + 1;
        }
        StringBuilder tableHtml = new StringBuilder();
        tableHtml.append("<table>\n");
        // Process header
        tableHtml.append("  <thead>\n    <tr>\n");
        for (String header : headers) {
            tableHtml.append("      <th>").append(processInlineElements(header.trim())).append("</th>\n");
        }
        tableHtml.append("    </tr>\n  </thead>\n");
        // Process body: rows start after the separator line
        int i = index + 2;
        StringBuilder body = new StringBuilder();
        while (i < lines.length && isTableRow(lines[i])) {
            String[] cells = parseTableRow(lines[i]);
            body.append("    <tr>\n");
            // Every row gets exactly as many cells as the header: missing cells are
            // rendered empty and surplus cells are dropped.
            for (int c = 0; c < headers.length; c++) {
                String cell = c < cells.length ? cells[c] : "";
                body.append("      <td>").append(processInlineElements(cell.trim())).append("</td>\n");
            }
            body.append("    </tr>\n");
            i++;
        }
        if (body.length() > 0) {
            tableHtml.append("  <tbody>\n").append(body).append("  </tbody>\n");
        }
        tableHtml.append("</table>");
        result.add(tableHtml.toString());
        return i;
    }
/**
 * Splits a table row into trimmed cell texts.
 *
 * <p>One leading and one trailing unescaped pipe are treated as the row border and
 * removed. The remainder is split on unescaped pipes, and escaped pipes ({@code \|})
 * become literal pipes. Empty cells inside the row are kept so the remaining cells
 * stay in their columns.
 *
 * @param line the raw row text
 * @return the cells in column order; an empty array when the row has no content
 */
private String[] parseTableRow(String line) {
    String row = line.trim();
    if (row.startsWith("|")) {
        row = row.substring(1);
    }
    if (row.endsWith("|") && !row.endsWith("\\|")) {
        row = row.substring(0, row.length() - 1);
    }
    if (row.trim().isEmpty()) {
        return new String[0];
    }
    // Limit -1 keeps trailing empty cells, which split() would otherwise discard.
    return Arrays.stream(row.split("(?<!\\\\)\\|", -1))
            .map(cell -> cell.replace("\\|", "|").trim())
            .toArray(String[]::new);
}
/**
 * Tells whether the whole line consists of dash runs, each optionally framed by
 * colons, separated by pipes and optionally bordered by a pipe on either side.
 */
private boolean isTableSeparator(String line) {
    final String separatorRegex = "^\\s*\\|?\\s*:?-+:?\\s*(\\|\\s*:?-+:?\\s*)*\\|?\\s*$";
    return Pattern.matches(separatorRegex, line);
}

5. Inline Element Processing

    /**
     * Process inline elements (bold, italic, links, etc.).
     *
     * <p>The text is escaped first, then the inline passes run in a fixed order. Images
     * are handled before links because image syntax contains link syntax; running the
     * link pass first would consume the {@code [alt](src)} part of an image. The method
     * is protected so that subclasses can extend the pipeline.
     *
     * @param text raw inline Markdown; may be null
     * @return the rendered HTML, or an empty string for null or empty input
     */
    protected String processInlineElements(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }
        // Escape HTML entities first
        String result = escapeHtml(text);
        // Process code spans (`code`)
        result = processCodeSpans(result);
        // Process images ![alt](src)
        result = processImages(result);
        // Process links [text](url)
        result = processLinks(result);
        // Process bold and italic
        result = processEmphasis(result);
        // Process strikethrough (GFM)
        if (supportGFM) {
            result = processStrikethrough(result);
        }
        return result;
    }
/**
 * Wraps backtick-delimited spans in {@code <code>} elements.
 *
 * <p>The span content is copied unchanged: the inline pipeline escapes the text before
 * calling this method, so escaping again here would turn {@code &lt;} into
 * {@code &amp;lt;}.
 *
 * @param text inline text that has already been escaped
 * @return the text with code spans converted
 */
private String processCodeSpans(String text) {
    Matcher matcher = Pattern.compile("`([^`]+)`").matcher(text);
    StringBuffer sb = new StringBuffer();
    while (matcher.find()) {
        // quoteReplacement keeps '$' and '\' in the code from being read as
        // group references by appendReplacement.
        matcher.appendReplacement(sb, Matcher.quoteReplacement("<code>" + matcher.group(1) + "</code>"));
    }
    matcher.appendTail(sb);
    return sb.toString();
}
/**
 * Matches [text](url "title"). The title may be delimited by literal quotes or by
 * {@code &quot;}, because the text has normally been escaped before this pass. The
 * lookbehind skips image syntax, which starts with '!'.
 */
private static final Pattern LINK_PATTERN = Pattern.compile(
        "(?<!!)\\[([^\\]]+)\\]\\(([^\\s)]+)(?:\\s+(?:\"|&quot;)(.+?)(?:\"|&quot;))?\\)");

/**
 * Converts inline links to {@code <a>} elements.
 *
 * <p>The matched parts are inserted unchanged. The inline pipeline has already escaped
 * the text and renders emphasis in later passes, so re-running the pipeline on the
 * link text or escaping the URL again would double-escape entities.
 *
 * NOTE(review): URL schemes are not restricted here, so a {@code javascript:} URL
 * passes through unchanged. Confirm whether sanitized output should reject it.
 *
 * @param text inline text that has already been escaped
 * @return the text with links converted
 */
private String processLinks(String text) {
    Matcher matcher = LINK_PATTERN.matcher(text);
    StringBuffer sb = new StringBuffer();
    while (matcher.find()) {
        String linkText = matcher.group(1);
        String url = matcher.group(2);
        String title = matcher.group(3);
        String titleAttr = title != null ? " title=\"" + title + "\"" : "";
        String replacement = "<a href=\"" + url + "\"" + titleAttr + ">" + linkText + "</a>";
        matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
    }
    matcher.appendTail(sb);
    return sb.toString();
}
/**
 * Matches ![alt](src "title"). The title may be delimited by literal quotes or by
 * {@code &quot;}, because the text has normally been escaped before this pass.
 */
private static final Pattern IMAGE_PATTERN = Pattern.compile(
        "!\\[([^\\]]*)\\]\\(([^\\s)]+)(?:\\s+(?:\"|&quot;)(.+?)(?:\"|&quot;))?\\)");

/**
 * Converts inline images to {@code <img>} elements.
 *
 * <p>The matched parts are inserted unchanged. The inline pipeline has already escaped
 * the text, so escaping the source, alt text or title again would double-escape
 * entities.
 *
 * @param text inline text that has already been escaped
 * @return the text with images converted
 */
private String processImages(String text) {
    Matcher matcher = IMAGE_PATTERN.matcher(text);
    StringBuffer sb = new StringBuffer();
    while (matcher.find()) {
        String alt = matcher.group(1);
        String src = matcher.group(2);
        String title = matcher.group(3);
        String titleAttr = title != null ? " title=\"" + title + "\"" : "";
        String replacement = "<img src=\"" + src + "\" alt=\"" + alt + "\"" + titleAttr + ">";
        matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
    }
    matcher.appendTail(sb);
    return sb.toString();
}
/**
 * Applies the emphasis passes in a fixed order: the two-character (bold) delimiters
 * first, then the one-character (italic) delimiters.
 */
private String processEmphasis(String text) {
    // Each row: delimiter, opening tag, closing tag.
    final String[][] passes = {
        {"**", "<strong>", "</strong>"},
        {"__", "<strong>", "</strong>"},
        {"*", "<em>", "</em>"},
        {"_", "<em>", "</em>"},
    };
    String current = text;
    for (String[] pass : passes) {
        current = processEmphasisType(current, pass[0], pass[1], pass[2]);
    }
    return current;
}
/**
 * Replaces spans enclosed in {@code delimiter} with {@code openTag ... closeTag}.
 *
 * <p>The enclosed content is inserted unchanged. It has already been escaped by the
 * inline pipeline, and the remaining emphasis passes still run over the whole string,
 * so re-running the pipeline on it would only double-escape entities. Underscore
 * delimiters are ignored when they touch a letter, digit or underscore on the outside,
 * so identifiers such as {@code snake_case_name} are left alone.
 *
 * @param text      the text to scan
 * @param delimiter the emphasis marker, e.g. {@code **} or {@code _}
 * @param openTag   the tag that replaces the opening marker
 * @param closeTag  the tag that replaces the closing marker
 * @return the text with matching spans wrapped in the tags
 */
private String processEmphasisType(String text, String delimiter, String openTag, String closeTag) {
    String quoted = Pattern.quote(delimiter);
    boolean underscore = delimiter.startsWith("_");
    String before = underscore ? "(?<![\\p{Alnum}_])" : "";
    String after = underscore ? "(?![\\p{Alnum}_])" : "";
    Pattern pattern = Pattern.compile(before + quoted + "([^" + quoted + "]+)" + quoted + after);
    Matcher matcher = pattern.matcher(text);
    StringBuffer sb = new StringBuffer();
    while (matcher.find()) {
        matcher.appendReplacement(sb, Matcher.quoteReplacement(openTag + matcher.group(1) + closeTag));
    }
    matcher.appendTail(sb);
    return sb.toString();
}
/**
 * Converts {@code ~~text~~} to {@code <del>text</del>}.
 *
 * <p>The enclosed content is inserted unchanged. In the inline pipeline this pass runs
 * last, on text that is already escaped and rendered, so re-running the pipeline on
 * the content would double-escape entities.
 *
 * @param text the text to scan
 * @return the text with strikethrough spans converted
 */
private String processStrikethrough(String text) {
    Matcher matcher = Pattern.compile("~~([^~]+)~~").matcher(text);
    StringBuffer sb = new StringBuffer();
    while (matcher.find()) {
        matcher.appendReplacement(sb, Matcher.quoteReplacement("<del>" + matcher.group(1) + "</del>"));
    }
    matcher.appendTail(sb);
    return sb.toString();
}

6. Utility Methods

    /**
     * Tells whether a line starts a block-level construct: heading, block quote,
     * code fence, horizontal rule, list item or, when GFM is enabled, a table row.
     */
    private boolean isBlockElement(String line) {
        if (isHeading(line)) {
            return true;
        }
        if (isBlockquote(line)) {
            return true;
        }
        if (isCodeBlock(line)) {
            return true;
        }
        if (isHorizontalRule(line)) {
            return true;
        }
        if (isList(line)) {
            return true;
        }
        return isTable(line) && supportGFM;
    }
/**
 * Tells whether a line is an ATX heading: one to six '#' characters followed by
 * whitespace.
 *
 * <p>Ordinary text is not a heading. Reporting every non-empty line as one would make
 * every such line a block element, so paragraph scanning could never consume text.
 * Underlined (Setext) headings cannot be recognised from a single line; detecting
 * them requires looking at the following line.
 */
private boolean isHeading(String line) {
    return line.matches("^#{1,6}\\s+.*");
}
/** Tells whether the first non-whitespace character of the line is '>'. */
private boolean isBlockquote(String line) {
    final String stripped = line.trim();
    return !stripped.isEmpty() && stripped.charAt(0) == '>';
}
/** Tells whether the line, ignoring surrounding whitespace, begins with a ``` fence. */
private boolean isCodeBlock(String line) {
    final String fence = "```";
    final String stripped = line.trim();
    return stripped.regionMatches(0, fence, 0, fence.length());
}
/**
 * Tells whether a line is a horizontal rule: at least three occurrences of the same
 * marker character ('*', '_' or '-'), optionally separated by whitespace. The
 * back-reference rejects lines that mix different markers, such as {@code *-_}.
 */
private boolean isHorizontalRule(String line) {
    return line.matches("^\\s*([*_-])(?:\\s*\\1){2,}\\s*$");
}
/**
 * Tells whether a line is a list item: optional indentation, a bullet ('*', '+', '-')
 * or a number followed by '.', then whitespace.
 */
private boolean isList(String line) {
    final String listItemRegex = "^\\s*([*+-]|\\d+\\.)\\s+.*";
    return Pattern.matches(listItemRegex, line);
}
/** Tells whether a line is a numbered list item: digits, a '.', then whitespace. */
private boolean isOrderedList(String line) {
    final String orderedItemRegex = "^\\s*\\d+\\.\\s+.*";
    return Pattern.compile(orderedItemRegex).matcher(line).matches();
}
/** Tells whether the line contains at least one pipe character. */
private boolean isTable(String line) {
    return line.indexOf('|') >= 0;
}
/** Tells whether the line contains a pipe and is not a table separator line. */
private boolean isTableRow(String line) {
    if (!line.contains("|")) {
        return false;
    }
    return !isTableSeparator(line);
}
/**
 * Measures the leading indentation of a line. A space counts as 1 and a tab as 4;
 * counting stops at the first character that is neither.
 */
private int getIndentationLevel(String line) {
    int width = 0;
    int pos = 0;
    while (pos < line.length()) {
        char ch = line.charAt(pos);
        if (ch == ' ') {
            width += 1;
        } else if (ch == '\t') {
            width += 4;
        } else {
            return width;
        }
        pos++;
    }
    return width;
}
/**
 * Escapes HTML special characters using the entity table.
 *
 * <p>The text is processed in a single pass, one character at a time. Applying the
 * replacements one after another over the whole string depends on the table's
 * iteration order: if "&lt;" were replaced before "&amp;", the "&amp;" of the freshly
 * produced entity would be escaped a second time. The method is protected so that
 * subclasses can escape their own output.
 *
 * @param text the text to escape
 * @return the escaped text, or {@code text} unchanged when sanitization is disabled
 */
protected String escapeHtml(String text) {
    if (!sanitizeHTML) return text;
    StringBuilder escaped = new StringBuilder(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        String entity = HTML_ENTITIES.get(String.valueOf(c));
        if (entity != null) {
            escaped.append(entity);
        } else {
            escaped.append(c);
        }
    }
    return escaped.toString();
}
/**
 * Derives an anchor id from heading text: lower-cased, every character other than
 * a-z, 0-9, whitespace and '-' removed, whitespace runs turned into single hyphens,
 * and leading/trailing hyphens dropped.
 *
 * <p>Lower-casing uses {@link Locale#ROOT} so the result does not depend on the
 * default locale (in a Turkish locale, for example, "I" lower-cases to a dotless i,
 * which the filter would then remove).
 *
 * @param text the heading text
 * @return the generated id; may be empty
 */
private String generateHeadingId(String text) {
    return text.toLowerCase(Locale.ROOT)
            .replaceAll("[^a-z0-9\\s-]", "")
            .replaceAll("\\s+", "-")
            .replaceAll("-+", "-")
            .replaceAll("^-|-$", "");
}

7. Command Line Interface

/**
 * Command Line Interface for Markdown to HTML Converter.
 *
 * <p>Arguments are read in two passes. The first pass collects the option flags so
 * they apply regardless of their position; the second pass performs the conversions
 * with a converter configured from those flags.
 */
public class MarkdownCLI {

    /**
     * Entry point.
     *
     * @param args command line arguments; see {@link #printHelp()}
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            printHelp();
            return;
        }
        boolean gfm = true;
        boolean sanitize = true;
        // Pass 1: flags. They must be known before the converter is created.
        for (String arg : args) {
            switch (arg) {
                case "--no-gfm":
                    gfm = false;
                    break;
                case "--allow-raw-html":
                    sanitize = false;
                    break;
                case "-h":
                case "--help":
                    printHelp();
                    return;
                default:
                    break;
            }
        }
        MarkdownToHtmlConverter converter = new MarkdownToHtmlConverter(gfm, sanitize, !sanitize);
        // Pass 2: actions.
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            switch (arg) {
                case "-i":
                case "--input": {
                    if (i + 1 >= args.length) {
                        System.err.println("Missing file name after " + arg);
                        break;
                    }
                    String inputFile = args[++i];
                    String outputFile = getOutputFile(inputFile, args, i);
                    if (hasExplicitOutput(args, i)) {
                        // Skip "-o FILE" so the file name is not converted as Markdown text.
                        i += 2;
                    }
                    try {
                        converter.convertFile(inputFile, outputFile);
                        System.out.println("Converted: " + inputFile + " -> " + outputFile);
                    } catch (IOException e) {
                        System.err.println("Error converting file: " + e.getMessage());
                    }
                    break;
                }
                case "-o":
                case "--output":
                    // An output option is only meaningful directly after an input file.
                    System.err.println("Ignoring " + arg + " without a preceding input file");
                    i++; // skip its value, if any
                    break;
                default:
                    // Treat as direct markdown input
                    if (!arg.startsWith("-")) {
                        System.out.println(converter.convert(arg));
                    }
                    break;
            }
        }
    }

    /** Tells whether the argument after {@code currentIndex} is an output option with a value. */
    private static boolean hasExplicitOutput(String[] args, int currentIndex) {
        return currentIndex + 2 < args.length
                && ("-o".equals(args[currentIndex + 1]) || "--output".equals(args[currentIndex + 1]));
    }

    /**
     * Determines the output file for an input file.
     *
     * @param inputFile    the input file name
     * @param args         all command line arguments
     * @param currentIndex index of the input file name within {@code args}
     * @return the explicitly requested output file, or the input name with ".md"
     *         replaced by (or, if absent, extended with) ".html"
     */
    private static String getOutputFile(String inputFile, String[] args, int currentIndex) {
        // Check if output file is specified
        if (hasExplicitOutput(args, currentIndex)) {
            return args[currentIndex + 2];
        }
        // Generate output filename
        if (inputFile.endsWith(".md")) {
            return inputFile.substring(0, inputFile.length() - 3) + ".html";
        }
        return inputFile + ".html";
    }

    /** Prints the usage text to standard output. */
    private static void printHelp() {
        System.out.println("Markdown to HTML Converter");
        System.out.println("Usage:");
        System.out.println("  java MarkdownCLI -i input.md [-o output.html]");
        System.out.println("  java MarkdownCLI \"# Markdown text\"");
        System.out.println();
        System.out.println("Options:");
        System.out.println("  -i, --input FILE     Input Markdown file");
        System.out.println("  -o, --output FILE    Output HTML file (default: input file with .html extension)");
        System.out.println("  --no-gfm             Disable GitHub Flavored Markdown extensions");
        System.out.println("  --allow-raw-html     Allow raw HTML in Markdown (disables sanitization)");
        System.out.println("  -h, --help           Show this help message");
    }
}

8. Advanced Features Extension

/**
 * Extended converter with additional features: emoji shortcodes, task list
 * checkboxes and an HTML page template.
 */
public class ExtendedMarkdownConverter extends MarkdownToHtmlConverter {

    /** A task marker at the very start of the text: "[ ]", "[x]" or "[X]". */
    private static final Pattern TASK_MARKER = Pattern.compile("\\[([ xX])\\](?:\\s+|$)");

    /** Not consulted anywhere in this class. */
    private boolean supportFootnotes = true;
    private boolean supportTaskLists = true;
    private boolean supportEmoji = true;

    /** Shortcode to emoji replacements. */
    private static final Map<String, String> EMOJI_MAP = Map.of(
        ":smile:", "😄",
        ":heart:", "❤️",
        ":warning:", "⚠️",
        ":rocket:", "🚀",
        ":thumbsup:", "👍"
        // Add more emojis as needed
    );

    /** Creates a converter with GFM and sanitization enabled and raw HTML disallowed. */
    public ExtendedMarkdownConverter() {
        super(true, true, false);
    }

    /**
     * Runs the inherited inline pipeline, then replaces emoji shortcodes and a leading
     * task marker. Overriding requires the superclass method to be accessible to
     * subclasses.
     */
    @Override
    protected String processInlineElements(String text) {
        String result = super.processInlineElements(text);
        if (supportEmoji) {
            result = processEmojis(result);
        }
        if (supportTaskLists) {
            result = processTaskLists(result);
        }
        return result;
    }

    /** Replaces every known shortcode with its emoji. */
    private String processEmojis(String text) {
        String result = text;
        for (Map.Entry<String, String> emoji : EMOJI_MAP.entrySet()) {
            result = result.replace(emoji.getKey(), emoji.getValue());
        }
        return result;
    }

    /**
     * Turns a task marker at the start of the text into a disabled checkbox. Only a
     * marker at the very start is converted, so bracketed text later in a sentence,
     * such as "see [x] below", is left alone.
     */
    private String processTaskLists(String text) {
        Matcher marker = TASK_MARKER.matcher(text);
        if (!marker.lookingAt()) {
            return text;
        }
        String checked = "x".equalsIgnoreCase(marker.group(1)) ? " checked" : "";
        return "<input type=\"checkbox\" disabled" + checked + "> " + text.substring(marker.end());
    }

    /**
     * Escapes text for use in the page template. A private helper is used because the
     * template must always be escaped and must not depend on superclass members that
     * may be inaccessible here. "&" is replaced first so entities produced by the
     * later replacements are not escaped again.
     */
    private static String escapeForTemplate(String text) {
        if (text == null) {
            return "";
        }
        return text.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\"", "&quot;")
                .replace("'", "&#39;");
    }

    /**
     * Convert with HTML template wrapper.
     *
     * @param markdown the Markdown source
     * @param title    page title, inserted escaped
     * @param cssFile  stylesheet URL, inserted escaped
     * @return a complete HTML document containing the converted Markdown
     */
    public String convertWithTemplate(String markdown, String title, String cssFile) {
        String content = convert(markdown);
        return String.format("""
            <!DOCTYPE html>
            <html lang="en">
            <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>%s</title>
            <link rel="stylesheet" href="%s">
            </head>
            <body>
            <article class="markdown-body">
            %s
            </article>
            </body>
            </html>
            """, escapeForTemplate(title), escapeForTemplate(cssFile), content);
    }
}

Usage Examples

Basic Usage

// Create a converter using the no-argument constructor (default settings).
MarkdownToHtmlConverter converter = new MarkdownToHtmlConverter();
// "\n\n" leaves a blank line between the heading and the paragraph text.
String markdown = "# Hello World\n\nThis is **bold** and *italic* text.";
// convert(...) returns the generated HTML as a String.
String html = converter.convert(markdown);
System.out.println(html);

File Conversion

// Reads README.md, converts it and writes the result to README.html.
// convertFile declares IOException, so call it from code that handles or declares it.
MarkdownToHtmlConverter converter = new MarkdownToHtmlConverter();
converter.convertFile("README.md", "README.html");

Command Line Usage

# Convert file
java MarkdownCLI -i README.md -o output.html
# Convert direct text
java MarkdownCLI "# Heading" "This is a paragraph."

Features Supported

  • Headings (ATX and Setext styles)
  • Paragraphs and line breaks
  • Emphasis (bold, italic)
  • Lists (ordered and unordered)
  • Links and images
  • Code blocks and inline code
  • Blockquotes
  • Horizontal rules
  • Tables (GFM)
  • Strikethrough (GFM)
  • Task lists (Extended)
  • Emoji (Extended)
  • HTML sanitization
  • File I/O operations

This implementation provides a robust, extensible foundation for Markdown processing that can be easily customized for specific needs.

Leave a Reply

Your email address will not be published. Required fields are marked *


Macro Nepal Helper