How can I convert raw markdown to plaintext in TypeScript/JavaScript?

Asked 3 days ago Viewed 7 times

I'm working on adding structured data to question pages on solvin (more specifically implementing QAPage).

Google's structured data documentation states:

Don't mark up content that is not visible to readers of the page.

This implies that markdown syntax (e.g. the # in # headings, the asterisks in *italics*/**bolding**, the backticks or tildes around code blocks, etc.) should be stripped from the body of questions before being added to structured data. This means I need to find a way to convert the raw user-generated markdown to plaintext, or, in other words, "remove" the markdown. How can I achieve this in TypeScript?

0
s
56
1 Answer
Sort by
Accepted by Author Posted 2 days ago

There are multiple markdown-to-text libraries on npm (markdown-to-text, markdown-to-txt, remove-markdown); however, they all proved either buggy in edge cases or unmaintained and built on old versions of libraries.

Instead I ended up writing my own marked Renderer that outputs plaintext:

import {
    Lexer,
    marked,
    Parser,
    Renderer,
    type MarkedOptions,
    type Tokens,
    TextRenderer
} from "marked";

/**
 * Strip Markdown formatting from a string.
 *
 * The input is tokenized with marked's `Lexer`, and the resulting token list
 * is rendered by a `Parser` configured with a {@link PlainTextRenderer}
 * instead of the default HTML renderer.
 *
 * @param markdownString Raw Markdown source.
 * @returns The text produced by rendering every token with the plaintext renderer.
 */
export function markdownToPlaintext(markdownString: string): string {
    const lexedTokens = Lexer.lex(markdownString);

    const renderer = new PlainTextRenderer();
    const plaintextParser = new Parser({ renderer });

    const plaintext = plaintextParser.parse(lexedTokens);
    return plaintext;
}

/**
 * PlainTextRenderer: a marked `Renderer` whose methods return plain text
 * rather than HTML.
 *
 * Block-level methods (code, blockquote, heading, hr, list, paragraph, table)
 * end their output with a blank line so consecutive blocks stay separated.
 * Inline methods return only the text of the element they render.
 */
export class PlainTextRenderer extends Renderer<string, string> {
    /**
     * @param options Options forwarded unchanged to the base `Renderer`.
     */
    constructor(options?: MarkedOptions<string, string>) {
        super(options);
    }

    /** Whitespace-only tokens contribute nothing to the output. */
    space(_token: Tokens.Space): string {
        return "";
    }

    /** Code blocks are emitted verbatim (without fences), followed by a blank line. */
    code({ text }: Tokens.Code): string {
        return text + "\n\n";
    }

    /**
     * Renders the quoted blocks and prefixes every line with a quote marker.
     *
     * @returns The quoted text followed by a blank line.
     */
    blockquote({ tokens }: Tokens.Blockquote): string {
        // The parsed body ends with the blank line that block renderers such
        // as paragraph() append. Trim it first; otherwise the split below
        // yields empty trailing entries that turn into dangling "> " lines.
        const body = this.parser.parse(tokens).trimEnd();
        const quoted = body
            .split("\n")
            // Blank lines inside the quote get a bare marker so the output
            // carries no trailing whitespace.
            .map((line) => (line === "" ? ">" : "> " + line))
            .join("\n");
        return quoted + "\n\n";
    }

    /** Raw HTML (block or inline) is dropped entirely. */
    html(_token: Tokens.HTML | Tokens.Tag): string {
        return "";
    }

    /** Headings become their inline text followed by a blank line; the level is ignored. */
    heading({ tokens }: Tokens.Heading): string {
        return this.parser.parseInline(tokens) + "\n\n";
    }

    /** Thematic breaks are rendered as a literal "---" line followed by a blank line. */
    hr(_token: Tokens.Hr): string {
        return "---\n\n";
    }

    /**
     * Renders every item through {@link listitem} and concatenates the results.
     * Each item already ends with a newline, so the extra "\n" leaves a blank
     * line after the list.
     */
    list(token: Tokens.List): string {
        return (
            token.items
                .map((item, i) =>
                    this.listitem(item, token.ordered, i, token.start)
                )
                .join("") + "\n"
        );
    }

    /**
     * Renders a single list item with a "- " or "N. " prefix.
     *
     * @param item The list item token.
     * @param ordered Whether the enclosing list is ordered.
     * @param index Zero-based position of the item within its list.
     * @param start Number of the first item of an ordered list; when it is
     *   not a number, numbering starts at 1.
     * @returns The prefixed item text terminated by a newline.
     */
    listitem(
        item: Tokens.ListItem,
        ordered = false,
        index = 0,
        start: number | "" = 1
    ): string {
        let prefix: string;
        if (ordered) {
            const num = typeof start === "number" ? start + index : index + 1;
            prefix = `${num}. `;
        } else {
            prefix = "- ";
        }

        const body = this.parser.parse(item.tokens, item.loose);
        // Trim the block separators added by nested renderers, then indent
        // continuation lines so they stay visually attached to this item.
        return prefix + body.trim().replace(/\n/g, "\n  ") + "\n";
    }

    /**
     * Textual stand-in for a task-list checkbox.
     * NOTE(review): listitem() in this class never calls this method, so
     * whether the marker reaches the output depends on the parser - confirm.
     */
    checkbox({ checked }: Tokens.Checkbox): string {
        return checked ? "[x]" : "[ ]";
    }

    /** Paragraphs become their inline text followed by a blank line. */
    paragraph({ tokens }: Tokens.Paragraph): string {
        return this.parser.parseInline(tokens) + "\n\n";
    }

    /**
     * Renders a table as pipe-separated lines: the header row, a delimiter
     * row that encodes each column's alignment, then one line per body row.
     *
     * Author's note: still a little unsure about the table formatting.
     */
    table(token: Tokens.Table): string {
        const rows = [
            token.header
                .map((cell) => this.parser.parseInline(cell.tokens))
                .join(" | "),
            token.align
                .map((a) =>
                    a
                        ? a === "left"
                            ? ":--"
                            : a === "right"
                              ? "--:"
                              : ":-:"
                        : "---"
                )
                .join(" | "),
            ...token.rows.map((row) =>
                row
                    .map((cell) => this.parser.parseInline(cell.tokens))
                    .join(" | ")
            )
        ];
        return rows.join("\n") + "\n\n";
    }

    /**
     * Returns the row text followed by a newline.
     * table() in this class assembles its rows itself and does not call this.
     */
    tablerow({ text }: Tokens.TableRow<string>): string {
        return text + "\n";
    }

    /**
     * Returns the inline text of a single cell.
     * table() in this class renders cells inline itself and does not call this.
     */
    tablecell(token: Tokens.TableCell): string {
        return this.parser.parseInline(token.tokens);
    }

    /** Bold text is reduced to its content. */
    strong({ tokens }: Tokens.Strong): string {
        return this.parser.parseInline(tokens);
    }

    /** Emphasized text is reduced to its content. */
    em({ tokens }: Tokens.Em): string {
        return this.parser.parseInline(tokens);
    }

    /** Inline code is reduced to its text. */
    codespan({ text }: Tokens.Codespan): string {
        return text;
    }

    /** Hard line breaks become a single newline. */
    br(_token: Tokens.Br): string {
        return "\n";
    }

    /** Struck-through text is reduced to its content. */
    del({ tokens }: Tokens.Del): string {
        return this.parser.parseInline(tokens);
    }

    /**
     * Renders a link as "label (href)".
     *
     * @returns Only the href when the label is empty or identical to it
     *   (as with autolinks), so the URL is not printed twice.
     */
    link({ href, tokens }: Tokens.Link): string {
        const text = this.parser.parseInline(tokens);
        if (text === "" || text === href) {
            return href;
        }
        return `${text} (${href})`;
    }

    /**
     * Renders an image as "alt text (href)".
     *
     * @returns Only the href when the image has no alt text.
     */
    image({ href, text }: Tokens.Image): string {
        return text === "" ? href : `${text} (${href})`;
    }

    /**
     * Renders a text or escape token: nested inline tokens are rendered when
     * present, otherwise the token's `text` is returned as-is.
     */
    text(token: Tokens.Text | Tokens.Escape): string {
        if ("tokens" in token && token.tokens) {
            return this.parser.parseInline(token.tokens);
        }
        return token.text;
    }
}
0
s
56

Your Answer

You Must Log In or Sign Up to Answer Questions