I'm working on adding structured data to question pages on solvin (more specifically implementing QAPage).
Google's structured data documentation states:
Don't mark up content that is not visible to readers of the page.
This implies that markdown syntax (e.g. the `#` in `# headings`, the `*` in `*italics*`/`**bolding**`, the backticks or tildes fencing code blocks, etc.) should be stripped from the body of questions before it is added to structured data. This means I need to find a way to convert the raw user-generated markdown to plain text or, in other words, to "remove" the markdown. How can I achieve this in TypeScript?
There are multiple markdown-to-text libraries on npm (markdown-to-text, markdown-to-txt, remove-markdown); however, they all proved to be either buggy in edge cases or unmaintained and built on old versions of libraries.
Instead, I ended up writing my own marked `Renderer` that outputs plain text:
import {
Lexer,
marked,
Parser,
Renderer,
type MarkedOptions,
type Tokens,
TextRenderer
} from "marked";
/**
 * Render a Markdown document as plain text.
 *
 * The input is tokenized with marked's `Lexer`, and the resulting tokens are
 * rendered by a `Parser` configured with a `PlainTextRenderer`.
 *
 * @param markdownString Raw Markdown source.
 * @returns The plain-text rendering produced by the parser.
 */
export function markdownToPlaintext(markdownString: string): string {
  const tokenList = Lexer.lex(markdownString);
  const renderer = new PlainTextRenderer();
  const plaintextParser = new Parser({ renderer });
  return plaintextParser.parse(tokenList);
}
/**
 * Marked renderer that emits plain text instead of HTML.
 *
 * Inline formatting (strong, em, del, codespan) is reduced to its text
 * content, raw HTML is dropped, and block-level elements are terminated by a
 * blank line. A few block constructs keep a lightweight textual marker so the
 * document structure survives: list bullets/numbers, "> " for block quotes,
 * "---" for thematic breaks, and " | "-separated table rows.
 *
 * Construction is inherited from `Renderer`; no extra state is kept.
 */
export class PlainTextRenderer extends Renderer<string, string> {
  /** Space tokens contribute nothing to the output. */
  space(_token: Tokens.Space): string {
    return "";
  }

  /** Code blocks are emitted as their text, followed by a blank line. */
  code({ text }: Tokens.Code): string {
    return text + "\n\n";
  }

  /**
   * Block quotes: render the nested blocks, then prefix every line with "> ".
   */
  blockquote({ tokens }: Tokens.Blockquote): string {
    // Nested blocks end with blank-line separators; strip them before
    // prefixing so they do not become dangling "> " lines.
    const body = this.parser.parse(tokens).trimEnd();
    const quoted = body
      .split("\n")
      .map((line) => "> " + line)
      .join("\n");
    return quoted + "\n\n";
  }

  /** Raw HTML (block or inline) is dropped entirely. */
  html(_token: Tokens.HTML | Tokens.Tag): string {
    return "";
  }

  /** Headings become their inline text followed by a blank line. */
  heading({ tokens }: Tokens.Heading): string {
    return this.parser.parseInline(tokens) + "\n\n";
  }

  /** Thematic breaks are rendered as a literal "---" line. */
  hr(_token: Tokens.Hr): string {
    return "---\n\n";
  }

  /** Lists: render each item with its bullet/number, then add a newline. */
  list(token: Tokens.List): string {
    const items = token.items.map((item, i) =>
      this.listitem(item, token.ordered, i, token.start)
    );
    return items.join("") + "\n";
  }

  /**
   * Render a single list item.
   *
   * @param item The list item token.
   * @param ordered Whether the enclosing list is ordered.
   * @param index Zero-based position of the item within its list.
   * @param start First number of an ordered list; when it is not a number,
   *   numbering starts at 1.
   * @returns The prefixed item text, terminated by a newline.
   */
  listitem(
    item: Tokens.ListItem,
    ordered = false,
    index = 0,
    start: number | "" = 1
  ): string {
    let prefix: string;
    if (ordered) {
      const num = typeof start === "number" ? start + index : index + 1;
      prefix = `${num}. `;
    } else {
      prefix = "- ";
    }
    const body = this.parser.parse(item.tokens, item.loose);
    // Continuation lines are indented by one space under the prefix.
    return prefix + body.trim().replace(/\n/g, "\n ") + "\n";
  }

  /**
   * Task-list checkbox marker.
   *
   * NOTE(review): listitem() above never calls this method, so whether the
   * marker appears in the output depends on whether the parser invokes it -
   * confirm against the marked version in use.
   */
  checkbox({ checked }: Tokens.Checkbox): string {
    return checked ? "[x]" : "[ ]";
  }

  /** Paragraphs become their inline text followed by a blank line. */
  paragraph({ tokens }: Tokens.Paragraph): string {
    return this.parser.parseInline(tokens) + "\n\n";
  }

  /**
   * Tables: header row, alignment row, then body rows, with cells joined by
   * " | ".
   *
   * NOTE(review): the alignment row (":--", "--:", ":-:", "---") is
   * Markdown-style delimiter syntax; the original author was unsure about
   * this format - consider dropping that row if the output must not contain
   * any Markdown syntax.
   */
  table(token: Tokens.Table): string {
    const renderCells = (cells: Tokens.TableCell[]): string =>
      cells.map((cell) => this.parser.parseInline(cell.tokens)).join(" | ");

    const alignmentRow = token.align
      .map((align) => {
        if (!align) return "---";
        if (align === "left") return ":--";
        if (align === "right") return "--:";
        return ":-:";
      })
      .join(" | ");

    const rows = [
      renderCells(token.header),
      alignmentRow,
      ...token.rows.map((row) => renderCells(row))
    ];
    return rows.join("\n") + "\n\n";
  }

  /** A table row is its text followed by a newline. */
  tablerow({ text }: Tokens.TableRow<string>): string {
    return text + "\n";
  }

  /** A table cell is its inline text. */
  tablecell(token: Tokens.TableCell): string {
    return this.parser.parseInline(token.tokens);
  }

  /** Strong emphasis: keep only the inner text. */
  strong({ tokens }: Tokens.Strong): string {
    return this.parser.parseInline(tokens);
  }

  /** Emphasis: keep only the inner text. */
  em({ tokens }: Tokens.Em): string {
    return this.parser.parseInline(tokens);
  }

  /** Inline code: keep only the code text. */
  codespan({ text }: Tokens.Codespan): string {
    return text;
  }

  /** Hard line breaks become a newline. */
  br(_token: Tokens.Br): string {
    return "\n";
  }

  /** Strikethrough: keep only the inner text. */
  del({ tokens }: Tokens.Del): string {
    return this.parser.parseInline(tokens);
  }

  /**
   * Links are rendered as "text (href)". When the link text is empty or is
   * identical to the href, only the href is emitted, which avoids output
   * such as "https://example.com (https://example.com)".
   */
  link({ href, tokens }: Tokens.Link): string {
    const text = this.parser.parseInline(tokens);
    if (text === "" || text === href) {
      return href;
    }
    return `${text} (${href})`;
  }

  /**
   * Images are rendered as "alt text (href)", or just the href when there is
   * no alt text.
   */
  image({ href, text }: Tokens.Image): string {
    return text ? `${text} (${href})` : href;
  }

  /**
   * Text and escape tokens: render nested inline tokens when present,
   * otherwise return the token's text unchanged.
   *
   * NOTE(review): the text is returned as-is; nothing here decodes character
   * references such as "&amp;" - confirm whether the consumer needs that.
   */
  text(token: Tokens.Text | Tokens.Escape): string {
    if ("tokens" in token && token.tokens) {
      return this.parser.parseInline(token.tokens);
    }
    return token.text;
  }
}